nlp augmentation (2022. 1. 21. 12:40)
# Install and import nlpaug package.
!cp -r ../input/nlpaug-from-github/nlpaug-master ./
!pip install nlpaug-master/
!rm -r nlpaug-master

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.word.context_word_embs as nawcwe
import nlpaug.augmenter.word.word_embs as nawwe
import nlpaug.augmenter.word.spelling as naws
from colorama import Fore
from pathlib import Path

# Set some seeds. We want randomness for real usage, but for this tutorial,
# determinism helps explain some examples.
import numpy as np
np.random.seed(1000)
import random
random.seed(1000)

base_path = Path("../input/feedback-prize-2021/train")
with open(base_path / "3FF2F530D590.txt") as f:
    sample = f.read()
print(f"Original: {len(sample)}\n{sample}")

aug = nac.KeyboardAug()
augmented_text = aug.augment(sample)
print(f"\nKeyboard augmentation: {len(augmented_text)}\n{augmented_text}")
# Replace the original text with a version that has extra whitespace removed.
sample = " ".join([x.strip() for x in sample.split()])
def print_and_highlight_diff(orig_text, new_texts):
    """A simple diff viewer for augmented texts."""
    orig_split = orig_text.split()
    print(f"Original: {len(orig_split)}\n{orig_text}\n")
    for new_text in new_texts:
        print(f"Augmented: {len(new_text.split())}")
        for i, word in enumerate(new_text.split()):
            if i < len(orig_split) and word == orig_split[i]:
                print(word, end=" ")
            else:
                print(Fore.RED + word + Fore.RESET, end=" ")
        print()
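To see what the viewer does, here is a minimal call on a made-up toy sentence (not part of the competition data): words that match the original position print normally, anything else prints in red.

# Hypothetical toy example: only the second word differs, so only it is highlighted.
print_and_highlight_diff("the cat sat on the mat", ["the dog sat on the mat"])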
Here KeyboardAug is restricted to at most one character change per word, applied to only about 5% of words, and digits and special characters are left untouched.
aug = nac.KeyboardAug(include_numeric=False, include_special_char=False, aug_char_max=1, aug_word_p=0.05)
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
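The augment-then-repair-apostrophes-then-diff pattern repeats for every augmenter below, so one option is to fold it into a small helper. This is only a convenience sketch; show_augmentations is a hypothetical name, not part of nlpaug.

def show_augmentations(aug, text, n=3):
    """Run an nlpaug augmenter n times, undo the spaced apostrophes its tokenizer leaves behind, and show the diff."""
    augmented = aug.augment(text, n=n)
    augmented = [x.replace(" ' ", "'") for x in augmented]
    print_and_highlight_diff(text, augmented)

# Hypothetical usage, equivalent to the cell above:
# show_augmentations(nac.KeyboardAug(aug_char_max=1, aug_word_p=0.05), sample)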
The spelling errors this produces are quite severe.
aug = naw.SpellingAug()
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
This augmenter draws from a database of common misspellings, so the errors look more natural.
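If the default replacement rate feels too aggressive, the augmenter exposes knobs for it. A hedged sketch, assuming the aug_p and stopwords parameters of recent nlpaug releases (names may differ in older versions, and the stopword list here is a made-up example):

# Misspell roughly 10% of words, and never touch a few protected words.
gentle_spelling = naw.SpellingAug(aug_p=0.1, stopwords=["not", "no"])
print_and_highlight_diff(sample, gentle_spelling.augment(sample, n=2))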
aug = naw.SynonymAug()
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
It replaces words with their synonyms (WordNet-based by default).
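Synonym swaps can alter words that carry meaning the labels depend on, so one common precaution is to keep negation words fixed. A minimal sketch, assuming SynonymAug's aug_src and stopwords arguments; the protected word list is illustrative only.

# WordNet-based synonym replacement that never touches negation words.
syn_aug = naw.SynonymAug(aug_src='wordnet', stopwords=["not", "no", "never"])
print_and_highlight_diff(sample, syn_aug.augment(sample, n=2))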
aug = nawwe.WordEmbsAug(model_type='glove', model_path='../input/glove-embeddings/glove.6B.300d.txt')
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
It replaces words with nearby words in a word-embedding space, here GloVe vectors.
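The same augmenter can also insert embedding-neighbour words instead of substituting existing ones. A sketch, assuming the action and top_k parameters of recent nlpaug versions; note that the simple positional diff above will flag everything after the first inserted word.

# Insert words drawn from the nearest GloVe neighbours instead of replacing existing ones.
insert_aug = nawwe.WordEmbsAug(model_type='glove',
                               model_path='../input/glove-embeddings/glove.6B.300d.txt',
                               action='insert', top_k=20)
print_and_highlight_diff(sample, insert_aug.augment(sample, n=2))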
aug = nawcwe.ContextualWordEmbsAug(model_path='../input/huggingface-bert-variants/bert-base-cased/bert-base-cased')
augmented_texts = aug.augment(sample, n=3)
augmented_texts = [x.replace(" ' ", "'") for x in augmented_texts]
print_and_highlight_diff(sample, augmented_texts)
It uses contextual embeddings from BERT, so replacement words are chosen to fit the surrounding context.
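BERT can also insert contextually plausible new words rather than substituting existing ones, and it runs much faster on a GPU. A hedged sketch, assuming the action, aug_p, and device parameters of recent nlpaug releases:

# Let BERT insert words that fit the context; run on GPU if one is available.
bert_insert = nawcwe.ContextualWordEmbsAug(
    model_path='../input/huggingface-bert-variants/bert-base-cased/bert-base-cased',
    action='insert', aug_p=0.1, device='cuda')
print_and_highlight_diff(sample, bert_insert.augment(sample, n=2))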