kaggle study1__rapids umap tfidf kmeans

CodingPython 2022. 1. 20. 18:42

import pandas as pd, os
import cudf, cuml, cupy
from tqdm import tqdm
import numpy as np
# Report the installed cuDF version so the notebook output records which RAPIDS build ran.
print('RAPIDS',cudf.__version__)

# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
# Read every training essay from disk into two parallel lists (ids and texts),
# then wrap them in a GPU DataFrame.
train_dir = '../input/feedback-prize-2021/train'
train_names, train_texts = [], []  # start empty; filled in matching order below
for f in tqdm(os.listdir(train_dir)):
    # The essay id is the file name with the '.txt' part removed.
    train_names.append(f.replace('.txt', ''))
    # The file is opened read-only; `append` acts on the Python list, not on
    # the file. The `with` block closes each handle as soon as it is read.
    with open(os.path.join(train_dir, f), 'r') as fh:
        train_texts.append(fh.read())
# Build a cuDF DataFrame from a dict of column name -> list of values.
train_text_df = cudf.DataFrame({'id': train_names, 'text': train_texts})
train_text_df.head()

from cuml.feature_extraction.text import TfidfVectorizer

# Vectorize the essays: English stop words are dropped, term presence is
# treated as binary, and the vocabulary is capped at 25,000 features.
tfidf = TfidfVectorizer(
    stop_words='english',
    binary=True,
    max_features=25_000,
)
# Fit on the text column only, then call .toarray() on the result to get the
# array form used as the essay embeddings by the later steps.
text_embeddings = tfidf.fit_transform(train_text_df['text']).toarray()

from cuml import UMAP

# UMAP (Uniform Manifold Approximation and Projection) reduces the TF-IDF
# embeddings with the reducer's default settings.
umap = UMAP()
# cupy.asnumpy() copies the result from the GPU into a host-side NumPy array,
# so the rest of the script works with a CPU array.
embed_2d = cupy.asnumpy(umap.fit_transform(text_embeddings))

from cuml import KMeans

# Partition the UMAP coordinates into 15 clusters and store each essay's
# cluster id next to its text.
kmeans = KMeans(n_clusters=15).fit(embed_2d)
train_text_df['cluster'] = kmeans.labels_

import matplotlib.pyplot as plt

# Cluster centroids from the fitted k-means model; used to position the labels.
centers = kmeans.cluster_centers_

plt.figure(figsize=(10, 10))
# One small dot per essay at its UMAP coordinates, colored by cluster label.
plt.scatter(embed_2d[:, 0], embed_2d[:, 1], s=1, c=kmeans.labels_)
plt.title('UMAP Plot of Train Text using Tfidf features\nRAPIDS Discovers the 15 essay topics!',size=16)

# Annotate each centroid with the vocabulary term whose mean TF-IDF value is
# highest among the essays assigned to that cluster.
for k in range(len(centers)):
    in_cluster = train_text_df.cluster.values == k
    mm = cupy.mean(text_embeddings[in_cluster], axis=0)  # per-term mean over the cluster's rows
    ii = cupy.argmax(mm)  # index of the largest mean
    top_word = tfidf.vocabulary_.iloc[ii]  # positional lookup of that term
    # Offset the label slightly from the centroid itself.
    label_x = centers[k, 0] - 1
    label_y = centers[k, 1] + 0.75
    plt.text(label_x, label_y, f'{k+1}-{top_word}', size=16)

plt.show()

# For every cluster, print its five highest-weight TF-IDF terms followed by up
# to three randomly sampled essays from that cluster.

# Loop-invariant lookups, computed once instead of once per cluster.
vocab = tfidf.vocabulary_.to_array()
cluster_ids = train_text_df.cluster.values

# Take the cluster count from the fitted model rather than repeating the literal 15.
for k in range(len(kmeans.cluster_centers_)):
    mm = cupy.mean(text_embeddings[cluster_ids == k], axis=0)  # mean over rows (per term)
    # argsort is ascending: take the last five indices, then reverse them so
    # the strongest term comes first.
    ii = cupy.asnumpy(cupy.argsort(mm)[-5:][::-1])
    top_words = vocab[ii]
    print('#'*25)
    print(f'### Essay Topic {k+1}')
    print('### Top 5 Words',top_words)
    print('#'*25)
    members = train_text_df.loc[train_text_df.cluster == k]
    # Never ask for more samples than the cluster holds; sample() would raise.
    n_examples = min(3, len(members))
    tmp = members.sample(n_examples, random_state=123)
    for j in range(n_examples):
        txt = tmp.iloc[j, 1]  # positional column 1 (the essay text, per the frame built earlier)
        print('-'*10,f'Example {j+1}','-'*10)
        print(txt,'\n')

df1.loc[0] 은 '전체 데이터 프레임에서 인덱스 이름이 0인 행만 추출해줘'

df1.iloc[0] 는 '전체 데이터 프레임에서 0번째 행에 있는 값들만 추출해줘'

저작자표시 비영리 변경금지

'CodingPython' 카테고리의 다른 글

b for a in x (0)	2022.01.31
lambda,list (0)	2022.01.31
tqqq 이번년도 50프로 하락 가능성 존재하지만... (3)	2022.01.14
얼굴인식에 대한 단상 (0)	2022.01.12
make_pipeline 의 문제점? 다양한 모델 (0)	2022.01.09

ABOUT ME

파이토치 파이토치

'CodingPython' 카테고리의 다른 글

티스토리툴바

ABOUT ME

'CodingPython' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바