4️⃣Feature Extraction

Feature Extraction

NLP에서 특징 추출은 원시 텍스트 데이터를 기계 학습 모델에서 사용할 수 있는 수치 표현으로 변환하는 프로세스를 말합니다.

파이프라인이 반환하는 출력 텐서의 모양은 입력 텍스트의 토큰 수와 BERT 모델의 은닉층(hidden layer) 크기에 따라 달라집니다.

"bert-base-uncased" 모델의 경우 각 토큰은 768차원 벡터로 표현되므로 결과의 모양은 (1, number_of_tokens, 768)이 됩니다. 여기서 1은 단일 문장을 처리하므로 배치 크기가 1임을 나타냅니다.

from transformers import pipeline

# Build a feature-extraction pipeline on the BERT base (uncased) checkpoint.
extractor = pipeline(task="feature-extraction", model="google-bert/bert-base-uncased")

# return_tensors=True yields a tensor, so .shape is available on the result.
sentence = "This is a simple test."
result = extractor(sentence, return_tensors=True)

# Shape is (batch size, number of tokens, hidden size).
result.shape
> torch.Size([1, 8, 768])
from transformers import pipeline

# Feature-extraction pipeline for the BART base checkpoint, using PyTorch.
checkpoint = "facebook/bart-base"
feature_extractor = pipeline(task="feature-extraction", model=checkpoint, framework="pt")

text = "Transformers is an awesome library!"

# [0] selects the first element of the pipeline output (the features for `text`).
# NOTE(review): the first example passes return_tensors=True; "pt" presumably
# works here only because it is truthy - confirm against the pipeline docs.
token_features = feature_extractor(text, return_tensors="pt")[0]

# Convert to NumPy, then average over axis 0 to get one 768-dimensional vector.
result = token_features.numpy().mean(axis=0)
result
array([ 2.65979469e-01, -6.56846821e-01,  1.25627622e-01,  1.36601961e+00,
       -7.59062350e-01,  7.56494999e-01, -1.05783379e+00, -1.87183488e-02,
       -6.71836197e-01, -1.96252692e+00,  2.81317592e-01,  1.35964262e+00,
       -7.21477158e-03,  9.12114024e-01,  3.43531281e-01, -1.35358238e+00,
       -7.05923915e-01,  1.40108848e+00,  9.22238588e-01, -1.89351216e-01,
        2.40267381e-01, -2.19243467e-01,  9.30557787e-01,  7.81986415e-01,
       -1.01491988e+00, -5.85219324e-01,  6.38441265e-01, -4.86978149e+00,
       -1.68945134e-01, -3.85040236e+00, -9.25553560e-01, -1.71881378e-01,
        5.59025332e-02, -6.87882125e-01, -1.03691256e+00, -3.37629229e-01,
        4.45500523e-01,  2.23247707e-01,  5.60621440e-01,  4.59313184e-01,
        2.38988829e+00,  6.36417508e-01, -8.32767725e-01,  3.60407889e-01,
       -2.03047842e-01,  9.03682828e-01, -1.65837502e+00,  1.19581902e+00,
       -2.48081341e-01,  8.12374890e-01, -4.88460273e-01,  2.99310595e-01,
        5.61737835e-01, -3.31205308e-01, -1.28257060e+00, -4.96809697e-03,
        7.20308483e-01, -4.77443188e-02,  4.61459041e-01,  1.31555617e+00,
       -2.52127767e-01, -2.13813365e-01, -8.04339498e-02, -7.61193752e-01,
       -5.84884942e-01, -7.94288933e-01, -7.32236505e-01,  1.45702612e+00,
        7.83099055e-01, -4.38804209e-01,  3.54020238e-01,  7.75779635e-02,
        1.90865564e+00, -1.15279332e-01, -2.89797974e+00,  4.35574591e-01,
        1.95469415e+00, -1.57004988e+00, -6.98929250e-01,  1.97130308e-01,
        2.16544896e-01,  1.79927900e-01, -4.03966725e-01, -5.33977710e-02,
        4.32771295e-02, -6.61527812e-01, -7.12189525e-02, -4.96350408e-01,
        3.23695511e-01,  3.63479644e-01, -7.13343740e-01,  9.46864963e-01,
       -4.25351888e-01, -1.62067080e+00,  4.80425507e-01,  6.41202092e-01,
        2.59259790e-01,  5.41801155e-01, -3.12652916e-01, -7.57046163e-01,
        1.49013770e+00,  2.58979249e+00, -1.38797617e+00,  6.22176707e-01,
       -4.93953610e-03,  1.16229892e+00,  1.27965415e+00, -7.59564459e-01,
        4.44990993e-01, -1.19959974e+00, -2.21594334e-01, -1.01445842e+00,
       -1.89106047e-01, -3.31210196e-01,  8.83097172e-01, -1.40546516e-01,
        3.58635008e-01, -4.30313945e-01,  1.20075715e+00, -3.19993407e-01,
        1.29259288e+00,  1.60403013e+00, -1.70512187e+00,  4.31367129e-01,
       -4.54189837e-01,  1.58637106e-01, -7.68502951e-01,  5.44382691e-01,
       -7.79063642e-01,  7.52042174e-01,  3.63594383e-01,  1.50957608e+00,
        4.73330081e-01,  5.57976484e-01,  3.91978323e-01, -1.21539712e+00,
       -1.84798136e-01,  8.70272458e-01, -1.24984539e+00, -3.81815940e-01,
       -2.21528202e-01, -2.32662141e-01,  1.01794183e+00,  4.10634369e-01,
       -2.76572406e-01, -7.22168162e-02,  7.32802749e-02, -4.83183920e-01,
        2.79749595e-02, -1.90762311e-01, -2.02779293e+00, -1.51625490e-02,
        2.34252289e-01,  1.20170951e+00,  5.16299963e-01, -1.41489655e-01,
       -2.46584147e-01,  1.91934600e-01, -2.89843321e-01,  7.42309749e-01,
       -2.88151741e-01, -8.30338299e-01,  1.08718443e+00,  7.55699515e-01,
        1.44116759e-01, -1.84054241e-01, -9.06795204e-01,  2.49311715e-01,
       -3.42900723e-01,  8.78563583e-01,  3.04132570e-02,  1.34575880e+00,
       -1.85381448e+00,  6.08434156e-02, -1.42206645e+00,  6.67507231e-01,
        1.34545970e+00,  4.45354342e-01,  2.74102747e-01, -1.42620504e+00,
        1.66471791e+00,  4.83929425e-01, -3.44172597e-01,  7.31653988e-01,
       -1.34243101e-01,  3.96722883e-01,  6.22375488e-01,  3.31292927e-01,
       -3.20087582e-01, -1.07533097e+00,  9.76715624e-01, -3.52599472e-01,
       -1.31508529e-01, -1.68829501e-01, -1.38619637e+00,  8.90181839e-01,
       -5.07539250e-02,  3.07513736e-02, -3.93885255e-01, -1.71657515e+00,
        1.08871162e+00,  4.53436106e-01,  1.21618068e+00,  5.08168757e-01,
       -6.95172369e-01, -9.72706795e-01, -6.98203564e-01, -3.77047509e-02,
        5.14743686e-01,  7.26660311e-01,  1.42454967e-01,  1.70566046e+00,
        2.11760625e-01,  7.90104866e-01, -1.11895156e+00,  9.30911675e-02,
       -8.53542924e-01, -3.47614557e-01, -8.34994614e-01, -2.87239719e-02,
        1.74926448e+00, -1.14972591e+00,  7.42217958e-01, -5.98051906e-01,
        4.17294323e-01,  9.28575099e-02, -1.65214360e+00,  2.74253279e-01,
       -6.29156351e-01, -6.69885874e-01, -6.18473053e-01,  4.31621671e-01,
       -5.20275354e-01,  1.36028194e+00,  1.14785850e+00, -7.18702853e-01,
        7.51556270e-03,  1.28168547e+00,  4.58477229e-01, -6.67250633e-01,
       -8.71611178e-01,  1.54485416e+00,  9.50916409e-01,  1.56043792e+00,
        4.03211296e-01,  1.39242363e+00, -1.03513181e-01,  5.22949100e-01,
        6.63481355e-01, -2.78556813e-02, -7.29202271e-01, -3.53717595e-01,
       -1.51166081e+00, -1.01856375e+00,  6.35557413e-01, -1.74585164e+00,
        1.15104437e+00,  1.59911370e+00,  1.51997781e+00,  3.44798148e-01,
       -1.03017783e+00, -3.45427930e-01, -3.04957658e-01, -1.52942911e-01,
       -5.44561386e-01,  2.72866637e-01, -4.63098109e-01,  1.88960946e+00,
       -6.39785528e-01,  4.92546350e-01,  3.46445173e-01,  1.83566287e-01,
        3.37440491e-01,  8.80383372e-01, -1.57478404e+00,  2.41358012e-01,
        3.42918366e-01, -1.77611008e-01, -1.44021642e+00, -4.05111313e-01,
        9.36335549e-02,  1.14355230e+00,  5.38416505e-01, -1.01677418e+00,
       -6.91158831e-01,  1.24062963e-01,  2.08523095e-01,  9.39111337e-02,
        9.54437554e-01, -9.73225534e-02,  1.23226786e+00,  1.23787582e+00,
       -3.61190140e-01,  9.04121161e-01,  1.90456226e-01,  3.66956741e-01,
       -8.37930024e-01,  5.74374676e-01, -1.30378753e-01,  1.15551674e+00,
        7.32960701e-01, -6.51617289e-01,  1.14175498e+00,  4.53374743e-01,
       -1.02572456e-01,  1.62613416e+00,  2.79911518e-01, -9.57020462e-01,
       -1.28427362e+00, -3.79492283e-01,  1.21321309e+00, -6.52288198e-01,
       -7.16957331e-01,  8.21545362e-01, -2.78386503e-01,  1.13116992e+00,
        1.09877765e+00, -9.30897295e-01,  3.44983310e-01,  8.14800978e-01,
       -4.39907372e-01,  3.38642240e-01, -4.09136683e-01,  1.71418175e-01,
        5.85774332e-02,  6.26304924e-01,  1.14245892e+00, -7.30417669e-02,
        4.70405728e-01,  2.00712815e-01, -1.10264778e+00,  6.69935524e-01,
        1.26470280e+00,  9.37951952e-02, -9.04536903e-01,  7.71072134e-02,
        6.09872103e-01,  4.51007843e-01,  9.30153728e-01, -3.91749442e-01,
        6.32270098e-01,  2.91940570e-01,  6.22446001e-01,  5.41503489e-01,
        4.00291502e-01,  5.99478722e-01,  8.12314451e-02,  8.11395943e-02,
        4.41665947e-01,  1.10141015e+00, -9.73566175e-01, -1.45525181e+00,
        5.64943314e-01, -2.38152429e-01,  1.41102523e-01, -7.86105156e-01,
        8.86056185e-01, -7.10839093e-01,  3.27000290e-01,  5.97459912e-01,
       -1.95455417e-01, -5.74278355e-01,  1.28140464e-01,  8.05716038e-01,
       -4.99953538e-01,  3.17015976e-01,  3.11146140e-01,  2.31078458e+00,
       -6.16076440e-02, -4.31919515e-01, -1.45730332e-01,  6.16032660e-01,
       -9.90341902e-01, -8.43507424e-02, -8.87243569e-01, -3.43032390e-01,
        1.29097724e+00, -6.21537507e-01,  5.30034304e-01,  2.00960255e+00,
       -3.07885379e-01, -3.07402015e-01, -2.99035162e-01,  1.58422029e+00,
        6.66770160e-01,  9.46137309e-01,  1.00991011e+00, -4.43267673e-01,
       -1.53554392e+00, -3.11964750e-01, -9.86443758e-02,  2.98534557e-02,
       -2.61718333e-01, -2.58908939e+00, -4.89793241e-01,  2.49610424e+00,
        5.08717000e-01,  6.50168598e-01,  3.08177888e-01,  4.27572727e-01,
        2.46583670e-01, -3.61945689e-01,  3.36508095e-01,  1.83386886e+00,
       -3.30805369e-02,  4.69550401e-01, -2.56731361e-01, -5.89862108e-01,
        1.82226884e+00,  4.07911837e-01, -5.95187187e-01,  8.44984949e-02,
        2.93200195e-01,  2.97689795e+00,  1.81511497e+00,  1.09478426e+00,
        6.32766724e-01, -3.60258728e-01, -1.17961836e+00, -6.41382575e-01,
       -1.11094289e-01, -3.20806690e-02,  6.67168975e-01, -3.80334437e-01,
        2.60429978e-01,  2.25518560e+00,  6.14036143e-01, -1.76677987e-01,
       -1.77043125e-01,  2.15038013e+00, -7.47117698e-01, -6.94940984e-01,
       -8.47300291e-01, -8.48803222e-02,  8.57784092e-01,  5.92671931e-01,
        8.59628022e-01, -2.30567765e+00, -7.14714766e-01,  1.16440272e+00,
       -1.58269092e-01,  1.78712055e-01,  6.23534322e-01,  5.20011425e-01,
        1.34862280e+00,  4.30682451e-01,  1.31071520e+00,  7.05770731e-01,
        8.79335642e-01,  1.04244995e+00, -4.27514404e-01,  1.63319424e-01,
       -1.31811261e+00,  3.38772506e-01,  6.16423368e-01,  9.51805353e-01,
       -4.15812321e-02,  1.17271984e+00, -1.08082747e+00, -3.87872487e-01,
       -6.65958166e-01, -8.24503243e-01, -1.04688489e+00,  3.82272780e-01,
       -4.10087854e-01,  6.21690869e-01, -3.57830405e-01, -2.88853496e-01,
       -3.67524803e-01, -6.23959303e-01, -1.07978083e-01, -7.72217512e-01,
       -3.48409325e-01,  5.99236190e-01, -3.06323647e-01,  6.41631603e-01,
        1.45477855e+00, -2.87821621e-01,  1.15087187e+00, -8.66089165e-01,
        1.25644231e+00,  1.78619817e-01, -3.97542268e-01,  1.23447955e+00,
        3.76234233e-01,  2.45014504e-01,  1.80363189e-02, -1.10603869e+00,
        8.92204285e-01,  6.63499773e-01,  1.25622559e+00, -1.29876032e-01,
       -5.97272754e-01, -3.75236988e-01,  4.66275960e-01,  2.05121726e-01,
        8.55793536e-01, -4.02110481e+00,  1.44663119e+00,  2.47043297e-01,
        6.41006827e-01, -2.76829362e-01,  9.26169932e-01,  4.13330823e-01,
        3.42508614e-01,  1.37350821e+00, -1.03768575e+00,  2.95568883e-01,
        7.84581363e-01, -6.02763034e-02, -1.78127453e-01, -8.99840117e-01,
        1.44706023e+00,  1.71805096e+00,  1.68698475e-01, -5.69088042e-01,
       -1.05945960e-01, -2.80161768e-01, -6.83287263e-01, -1.20706224e+00,
       -6.09945178e-01, -1.30536163e+00, -2.01481447e-01, -2.13613780e-03,
        8.87428403e-01, -7.25608051e-01,  1.03411388e+00,  6.20733440e-01,
       -2.92526424e-01, -7.22831935e-02,  2.99865872e-01, -9.95002985e-01,
        9.87491369e-01,  2.62370020e-01,  3.95637065e-01, -8.62838387e-01,
       -3.04033637e-01,  3.50857317e-01, -4.46875505e-02,  9.52754319e-02,
       -1.81237090e+00,  1.28286946e+00, -1.01697290e+00,  3.33020598e-01,
        5.44873066e-02,  8.81249309e-02, -8.30183446e-01, -1.25741291e+00,
        1.13769807e-01,  4.17068511e-01, -3.84530395e-01, -2.24328697e-01,
        1.64285719e-01,  2.24695787e-01, -1.52362788e+00, -1.03049445e+00,
        5.07936776e-01,  6.00863695e-01,  6.78084612e-01,  5.73505223e-01,
        9.01764214e-01,  2.14455634e-01, -8.64829645e-02,  5.21133542e-01,
       -3.84391618e+00, -4.32377815e-01,  3.86084586e-01, -5.74579954e-01,
       -6.64480180e-02,  5.68239272e-01,  1.40972987e-01,  4.91561055e-01,
       -1.59076011e+00,  4.58789200e-01, -3.81277895e+00, -3.56776267e-01,
       -7.03040540e-01,  4.01638448e-01, -4.14998889e-01,  9.21632528e-01,
       -2.88249195e-01,  5.43611228e-01, -1.87583610e-01,  1.77437171e-01,
        3.09019029e-01, -8.21537554e-01,  9.64059532e-01, -4.18440819e-01,
       -3.74807954e-01, -7.94256628e-01,  1.79041013e-01,  1.07198226e+00,
        7.37931907e-01, -5.71660399e-01,  4.73279476e-01, -1.12489057e+00,
       -8.20027113e-01,  1.16821051e+00,  1.13403749e+00, -8.37848783e-01,
       -6.94696903e-01, -5.87071300e-01, -9.37979996e-01,  2.85793412e-02,
       -1.03942998e-01,  8.15052748e-01,  2.77216405e-01,  1.07841063e+00,
        8.49105597e-01, -1.22114134e+00,  1.31575480e-01,  4.65513058e-02,
       -9.81023669e-01, -1.33689547e+00,  9.66923177e-01, -1.29471350e+00,
        1.38094172e-01,  6.99724734e-01, -3.32960755e-01,  1.33537698e+00,
       -3.91558230e-01,  5.63477397e-01, -2.24221781e-01, -2.44365379e-01,
       -1.46850929e-01, -1.11731577e+00, -6.03061169e-02, -4.82764870e-01,
        9.59276438e-01,  2.19666290e+00, -9.22762871e-01,  3.80298674e-01,
        3.07932347e-01,  1.24999022e+00,  3.42894405e-01,  9.13683604e-03,
       -2.37314433e-01, -5.78179881e-02, -1.28482133e-01,  8.15542936e-01,
        4.25670408e-02,  1.63982916e+00, -2.73039818e-01, -1.85594022e-01,
       -8.31672192e-01,  9.44418490e-01, -8.35879385e-01,  8.57529119e-02,
       -2.79763341e-01,  9.06917334e-01,  3.24907333e-01,  3.79474759e-02,
        1.75888743e-02, -3.66290390e-01, -5.95124885e-02,  6.87758029e-01,
        5.07672548e-01,  8.89262438e-01,  1.69256181e-01, -2.01039672e-01,
        5.85848987e-01,  1.67553282e+00,  4.36515212e-01, -3.62415373e-01,
        8.22778121e-02,  4.75326419e-01,  1.03972077e+00,  1.17408001e+00,
        1.08113861e+00,  1.17782259e+00,  7.19923496e-01, -6.69488251e-01,
        1.36961073e-01,  1.56454515e+00, -4.26551163e-01, -9.29266691e-01,
        7.18483865e-01,  3.78707618e-01,  7.51538515e-01,  9.56296325e-02,
        2.21472085e-01,  2.28069518e-02,  5.18463790e-01,  1.02798796e+00,
        2.11316586e-01,  1.61214125e+00,  7.54863799e-01, -9.27332759e-01,
       -1.86410949e-01, -1.12251091e+00,  3.68848145e-01,  4.18064177e-01,
        7.76560783e-01,  2.79443599e-02, -3.13859940e-01, -4.47741836e-01,
        4.84260559e-01,  1.09082341e+00,  1.55656290e+00, -3.40694696e-01,
        4.41435456e-01,  7.73885310e-01,  6.59892440e-01, -1.66957736e+00,
       -1.25799298e+00,  7.65725374e-01,  7.07073510e-01, -1.03319263e+00,
        5.52317381e-01,  3.94287586e-01,  3.92046362e-01, -1.49054527e-01,
       -5.83782732e-01, -5.09692848e-01,  5.62452197e-01,  6.57410741e-01,
       -5.00763655e-01,  4.95452195e-01,  3.58396247e-02, -7.79511034e-01,
        4.42655563e-01, -2.28039667e-01,  8.00651312e-02,  1.79112375e+00,
        3.67924958e-01,  1.30259705e+00,  1.01184118e+00, -8.26271296e-01,
       -1.88019410e-01,  1.58395958e+00, -5.49807787e-01,  5.25783479e-01,
       -8.63637924e-01, -5.48161387e-01,  4.79667455e-01,  4.72317845e-01,
       -3.08872998e-01,  8.75118449e-02, -8.59509230e-01,  7.50889957e-01,
       -2.64737934e-01,  7.62561083e-01, -4.49841231e-01,  1.45644456e-01,
        1.01997578e+00,  3.77051145e-01, -2.72834718e-01,  4.99737054e-01,
       -1.25988948e+00, -5.44752657e-01,  2.32418612e-01,  9.05423999e-01,
       -5.48643887e-01,  7.33481586e-01, -5.10832906e-01,  1.30848140e-01,
        7.81549454e-01, -4.22700197e-01, -3.84838343e-01,  1.11054122e+00,
       -9.99024928e-01,  1.36228776e+00, -6.92071795e-01, -3.62365395e-01,
       -2.13862792e-01,  7.48088360e-01, -6.61671400e-01,  3.47315609e-01],
      dtype=float32)
  • feature_extractor(text, return_tensors="pt"): 입력 텍스트를 특징 추출 파이프라인으로 전달하고 결과를 PyTorch 텐서로 반환하도록 지정합니다.

  • [0]: 입력 텍스트의 특징 표현에 해당하는 출력의 첫 번째 요소를 선택합니다.

  • .numpy(): 파이토치 텐서를 NumPy 배열로 변환합니다.

  • .mean(axis=0): 시퀀스 차원(축 0)에 걸쳐 특징 벡터의 평균을 계산합니다. 이는 모든 토큰의 특징을 768차원의 단일 벡터로 집계하는 방법입니다(BART base 모델의 은닉 크기(hidden size)가 768이므로).

Last updated