  • PyTorch "ShortFormer" - RoBERTa w/Chunks (kaggle study__2)
    Autonomous Driving Study 2022. 1. 21. 11:29

     

    Transformer models are great, and we all love them.
    However, the self-attention mechanism at the heart of the Transformer architecture
    involves a matrix multiplication whose memory cost scales (at least) quadratically
    with the input sequence length. This operation is expensive, and it makes vanilla
    Transformers prohibitive on long sequences. That is why models like BERT, which we
    see and use all the time, are capped at 512 tokens.
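
    To see where the quadratic cost comes from, here is a minimal illustrative sketch
    (not the notebook's code): the attention score matrix has shape (seq_len, seq_len),
    so doubling the sequence length quadruples the memory needed for that one tensor.

    import torch

    seq_len, d_model = 512, 768            # BERT-like sizes, chosen only for illustration
    q = torch.randn(seq_len, d_model)      # queries
    k = torch.randn(seq_len, d_model)      # keys
    scores = q @ k.T                       # attention scores: shape (seq_len, seq_len)
    print(scores.shape)                    # torch.Size([512, 512]) -> grows as seq_len ** 2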

     

    import os
    import gc
    import time
    from tqdm import tqdm
    from collections import defaultdict
    
    import numpy as np
    import pandas as pd
    from sklearn.metrics import accuracy_score
    
    import torch
    from torch.utils.data import Dataset, DataLoader
    
    from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification
    
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

     

    # This is used to download the model from the huggingface hub
    MODEL_NAME = 'roberta-base'
    
    # Path where to download the model
    MODEL_PATH = 'model'
    
    
    # Max length for the tokenization and the model
    # For BERT-like models it's 512 in general
    MAX_LENGTH = 512
    
    # The overlapping tokens when chunking the texts
    # Possibly a power of 2 would have been better
    # Tried with 386 and didn't improve
    DOC_STRIDE = 200
    
    # Training configuration
    # 5 epochs with different learning rates (inherited from Chris')
    # Haven't tried variations yet
    config = {'train_batch_size': 4,
              'valid_batch_size': 4,
              'epochs': 5,
              'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
              'max_grad_norm': 10,
              'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
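
    Chunking is what MAX_LENGTH and DOC_STRIDE are for: essays longer than 512 tokens are
    split into overlapping windows. A minimal sketch, assuming the standard Hugging Face
    overflow mechanism (long_words below is a hypothetical essay already split into words):

    # Illustrative only: split one long word list into overlapping MAX_LENGTH-token chunks
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
    long_words = ['word'] * 2000                       # hypothetical 2000-word essay
    encoded = tokenizer(long_words,
                        is_split_into_words=True,
                        max_length=MAX_LENGTH,
                        truncation=True,
                        stride=DOC_STRIDE,             # consecutive chunks share 200 tokens
                        return_overflowing_tokens=True)
    print(len(encoded['input_ids']))                   # number of chunks produced for this text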

     

     

    df_all = pd.read_csv('../input/feedback-prize-2021/train.csv')
    print(df_all.shape)
    display(df_all.head())

     

    # https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
    # Read every training essay into memory, keyed by its file id
    train_names, train_texts = [], []
    for f in tqdm(list(os.listdir('../input/feedback-prize-2021/train'))):
        train_names.append(f.replace('.txt', ''))
        with open('../input/feedback-prize-2021/train/' + f, 'r') as fp:
            train_texts.append(fp.read())
    
    # Build the DataFrame once, after the loop (not inside it)
    df_texts = pd.DataFrame({'id': train_names, 'text': train_texts})
    
    df_texts['text_split'] = df_texts.text.str.split()
    df_texts.head()

     

    **iterrows() is a generator that iterates over the rows of a DataFrame, yielding each row's index together with an object containing the row itself. ... In the example, iterrows() solves essentially the same problem about 4x faster than iterating over the rows by hand.**

    The DataFrame.iterrows() method returns (index, Series) pairs.
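
    A tiny illustration with made-up data, just to show the shape of what comes back:

    demo = pd.DataFrame({'id': ['a', 'b'], 'n_words': [3, 5]})
    for idx, row in demo.iterrows():
        print(idx, row['id'], row['n_words'])    # 0 a 3, then 1 b 5 -> (index, Series fields)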

    # https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615
    all_entities = []
    for _, row in tqdm(df_texts.iterrows(), total=len(df_texts)):
        
        total = len(row['text_split'])
        entities = ["O"] * total
        
        for _, row2 in df_all[df_all['id'] == row['id']].iterrows():
            discourse = row2['discourse_type']
            list_ix = [int(x) for x in row2['predictionstring'].split(' ')]
            entities[list_ix[0]] = f"B-{discourse}"
            for k in list_ix[1:]: entities[k] = f"I-{discourse}"
        all_entities.append(entities)
    
    df_texts['entities'] = all_entities
    df_texts.to_csv('train_NER.csv',index=False)
    
        
    print(df_texts.shape)
    df_texts.head()
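
    To make the loop above concrete, here is a tiny hand-made example (invented words and
    indices, not competition data) of how one predictionstring turns into BIO entities:

    words = "Cars should be banned in cities".split()   # 6 words in a made-up essay
    entities = ["O"] * len(words)
    predictionstring = "1 2 3"                           # word indices covered by one Claim
    list_ix = [int(x) for x in predictionstring.split()]
    entities[list_ix[0]] = "B-Claim"                     # first word of the span gets B-
    for k in list_ix[1:]: entities[k] = "I-Claim"        # remaining words get I-
    print(entities)   # ['O', 'B-Claim', 'I-Claim', 'I-Claim', 'O', 'O']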

    **Since .str.len() gives the number of items in each row's list, we can check that exactly one label was created per word.**

    # Check that we have created one entity/label for each word correctly
    (df_texts['text_split'].str.len() == df_texts['entities'].str.len()).all()
    # Create global dictionaries to use during training and inference
    
    # https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615
    output_labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 
              'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']
    
    LABELS_TO_IDS = {v:k for k,v in enumerate(output_labels)}
    IDS_TO_LABELS = {k:v for k,v in enumerate(output_labels)}
    
    LABELS_TO_IDS
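
    As a quick illustration of how these dictionaries get used (the example labels below are
    made up), word-level labels map to integer targets for the model and back again:

    example_entities = ['O', 'B-Claim', 'I-Claim']
    target_ids = [LABELS_TO_IDS[e] for e in example_entities]
    print(target_ids)                                    # [0, 5, 6]
    print([IDS_TO_LABELS[i] for i in target_ids])        # ['O', 'B-Claim', 'I-Claim']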

     

    Find the unique values in the id column.

    # CHOOSE VALIDATION INDEXES
    IDS = df_all.id.unique()
    print(f'There are {len(IDS)} train texts. We will split 90% 10% for validation.')
    
    # TRAIN VALID SPLIT 90% 10%
    np.random.seed(42)
    train_idx = np.random.choice(np.arange(len(IDS)),int(0.9*len(IDS)),replace=False)
    valid_idx = np.setdiff1d(np.arange(len(IDS)),train_idx)
    np.random.seed(None)
    
    # CREATE TRAIN SUBSET AND VALID SUBSET
    df_train = df_texts.loc[df_texts['id'].isin(IDS[train_idx])].reset_index(drop=True)
    df_val = df_texts.loc[df_texts['id'].isin(IDS[valid_idx])].reset_index(drop=True)
    
    print(f"FULL Dataset : {df_texts.shape}")
    print(f"TRAIN Dataset: {df_train.shape}")
    print(f"TEST Dataset : {df_val.shape}")

    **The backbone is the pretrained model itself: AutoModelForTokenClassification loads roberta-base and attaches a token-classification head sized for our 15 labels.**

    def download_model():
        # https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615
        os.mkdir(MODEL_PATH)
        
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
        tokenizer.save_pretrained(MODEL_PATH)
    
        config_model = AutoConfig.from_pretrained(MODEL_NAME) 
        config_model.num_labels = 15
        config_model.save_pretrained(MODEL_PATH)
    
        backbone = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, 
                                                                   config=config_model)
        backbone.save_pretrained(MODEL_PATH)
        print(f"Model downloaded to {MODEL_PATH}/")
        
    download_model()
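
    The saved tokenizer and model can then be loaded back from MODEL_PATH when training
    starts; a minimal sketch (the training loop itself is not shown in this excerpt):

    # Load the locally saved tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
    model.to(config['device'])
    print(model.config.num_labels)    # 15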

     
