nlp augmentation (2022. 1. 21. 12:40)
# Install and import nlpaug package.
!cp -r ../input/nlpaug-from-github/nlpaug-master ./
!pip install nlpaug-master/
!rm -r nlpaug-master

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.word.context_word_embs as nawcwe
import nlpaug.augmenter.word.word_embs as nawwe
import nlpaug.augmenter.word.spelling as naws
from colorama import Fore
from pathlib import Path

# Set some seeds. We want randomness for real usage, but for this tutorial,
# determinism helps explain some examples.
import numpy as np
np.random.seed(1000)
import random
random.seed(1000)

base_path = Path("../input/feedback-prize-2021/train")
with open(base_path / "3FF2F530D590.txt") as f:
    sample = f.read()
print(f"Original: {len(sample)}\n{sample}")

aug = nac.KeyboardAug()
augmented_text = aug.augment(sample)
print(f"\nKeyboard augmentation: {len(augmented_text)}\n{augmented_text}")
# Replace the original text with a version that has extra whitespace removed.
sample = " ".join([x.strip() for x in sample.split()])
def print_and_highlight_diff(orig_text, new_texts):
    """A simple diff viewer for augmented texts."""
    orig_split = orig_text.split()
    print(f"Original: {len(orig_split)}\n{orig_text}\n")
    for new_text in new_texts:
        print(f"Augmented: {len(new_text.split())}")
        for i, word in enumerate(new_text.split()):
            if i < len(orig_split) and word == orig_split[i]:
                print(word, end=" ")
            else:
                print(Fore.RED + word + Fore.RESET, end=" ")
        print()
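To see what the viewer does, here is a minimal call on a made-up toy sentence (not part of the competition data): words that match the original position print normally, anything else prints in red.

# Hypothetical toy example: only the second word differs, so only it is highlighted.
print_and_highlight_diff("the cat sat on the mat", ["the dog sat on the mat"])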
Here KeyboardAug is restricted to at most one character change per word, applied to only about 5% of words, and digits and special characters are left untouched.
aug = nac.KeyboardAug(include_numeric=False, include_special_char=False, aug_char_max=1, aug_word_p=0.05)
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
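The augment-then-repair-apostrophes-then-diff pattern repeats for every augmenter below, so one option is to fold it into a small helper. This is only a convenience sketch; show_augmentations is a hypothetical name, not part of nlpaug.

def show_augmentations(aug, text, n=3):
    """Run an nlpaug augmenter n times, undo the spaced apostrophes its tokenizer leaves behind, and show the diff."""
    augmented = aug.augment(text, n=n)
    augmented = [x.replace(" ' ", "'") for x in augmented]
    print_and_highlight_diff(text, augmented)

# Hypothetical usage, equivalent to the cell above:
# show_augmentations(nac.KeyboardAug(aug_char_max=1, aug_word_p=0.05), sample)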
The spelling errors this produces are quite severe.
aug = naw.SpellingAug()
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
This augmenter draws from a database of common misspellings, so the errors look more natural.
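If the default replacement rate feels too aggressive, the augmenter exposes knobs for it. A hedged sketch, assuming the aug_p and stopwords parameters of recent nlpaug releases (names may differ in older versions, and the stopword list here is a made-up example):

# Misspell roughly 10% of words, and never touch a few protected words.
gentle_spelling = naw.SpellingAug(aug_p=0.1, stopwords=["not", "no"])
print_and_highlight_diff(sample, gentle_spelling.augment(sample, n=2))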
aug = naw.SynonymAug()
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
It replaces words with their synonyms (WordNet-based by default).
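Synonym swaps can alter words that carry meaning the labels depend on, so one common precaution is to keep negation words fixed. A minimal sketch, assuming SynonymAug's aug_src and stopwords arguments; the protected word list is illustrative only.

# WordNet-based synonym replacement that never touches negation words.
syn_aug = naw.SynonymAug(aug_src='wordnet', stopwords=["not", "no", "never"])
print_and_highlight_diff(sample, syn_aug.augment(sample, n=2))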
aug = nawwe.WordEmbsAug(model_type='glove', model_path='../input/glove-embeddings/glove.6B.300d.txt')
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
It replaces words with nearby words in a word-embedding space, here GloVe vectors.
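The same augmenter can also insert embedding-neighbour words instead of substituting existing ones. A sketch, assuming the action and top_k parameters of recent nlpaug versions; note that the simple positional diff above will flag everything after the first inserted word.

# Insert words drawn from the nearest GloVe neighbours instead of replacing existing ones.
insert_aug = nawwe.WordEmbsAug(model_type='glove',
                               model_path='../input/glove-embeddings/glove.6B.300d.txt',
                               action='insert', top_k=20)
print_and_highlight_diff(sample, insert_aug.augment(sample, n=2))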
aug = nawcwe.ContextualWordEmbsAug(model_path='../input/huggingface-bert-variants/bert-base-cased/bert-base-cased')
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
It uses contextual embeddings from BERT, so replacement words are chosen to fit the surrounding context.
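BERT can also insert contextually plausible new words rather than substituting existing ones, and it runs much faster on a GPU. A hedged sketch, assuming the action, aug_p, and device parameters of recent nlpaug releases:

# Let BERT insert words that fit the context; run on GPU if one is available.
bert_insert = nawcwe.ContextualWordEmbsAug(
    model_path='../input/huggingface-bert-variants/bert-base-cased/bert-base-cased',
    action='insert', aug_p=0.1, device='cuda')
print_and_highlight_diff(sample, bert_insert.augment(sample, n=2))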