def kkma_instance():
    # Lazily import and construct the Kkma morphological analyzer.
    from konlpy.tag import Kkma
    k = Kkma()
    return k

def tkorean_instance():
    # Okt (Open Korean Text) is the renamed Twitter tagger.
    from konlpy.tag import Okt
    t = Okt()
    return t

def hannanum_instance():
    # Hannanum needs a running JVM; init_jvm() starts one if necessary.
    from konlpy import init_jvm
    from konlpy.tag import Hannanum
    init_jvm()
    h = Hannanum()
    return h

def komoran_instance():
    from konlpy.tag import Komoran
    k = Komoran()
    return k
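Each helper returns a ready-to-use tagger, and all four classes share the same pos/morphs/nouns interface. A minimal usage sketch (the sentence is an arbitrary example):

kkma = kkma_instance()
sentence = u'아버지가 방에 들어가신다'
print(kkma.pos(sentence))    # (morpheme, POS-tag) pairs
print(kkma.nouns(sentence))  # nouns only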
def test_daum_streamer():
    # DaumStreamer/NaverStreamer come from konlpy's stream subpackage
    # (import path assumed here; adjust if the classes live elsewhere).
    from konlpy.stream import DaumStreamer
    daum = DaumStreamer()
    daum.options.n_limits = 1
    daum.options.display_rank = True
    daum.options.verbose = True
    daum.options.interval = 3
    daum.stream()

def test_naver_streamer():
    from konlpy.stream import NaverStreamer
    naver = NaverStreamer()
    naver.options.n_limits = 1
    naver.options.display_rank = True
    naver.options.verbose = True
    naver.options.interval = 3
    naver.stream()
def test_corpus_kolaw():
    from konlpy.corpus import kolaw
    fids = kolaw.fileids()
    kolaw.abspath()
    kolaw.abspath(fids[0])
    assert kolaw.name == 'kolaw'
    # The first 10 characters of constitution.txt decode to
    # '대한민국헌법\n\n유구' ("Constitution of the Republic of Korea ...").
    assert kolaw.open('constitution.txt').read(10) == \
        u'\ub300\ud55c\ubbfc\uad6d\ud5cc\ubc95\n\n\uc720\uad6c'
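The same reader works outside a test; a short sketch combining the corpus with one of the tagger helpers above (only the first 100 characters are tagged, to keep the run quick):

from konlpy.corpus import kolaw
from konlpy.tag import Kkma

doc = kolaw.open('constitution.txt').read()
print(Kkma().nouns(doc[:100]))  # nouns from the opening of the Constitution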
# Method excerpt: requires module-level `import pandas as pd` and
# `from konlpy.tag import Mecab`.
def _conv_type_b(self, idx):
    """
    Read the pattern CSV, tokenize each encode line (with Mecab when
    enabled, otherwise by whitespace), and run augmentation and intent
    formatting on the result.
    :param idx: worker thread index, used only for progress logging
    :return: None
    """
    df_csv_read = pd.read_csv(self.pattern_data_path,
                              skipinitialspace=True,
                              engine="python",
                              encoding='utf-8-sig')
    i = 0
    for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values):
        words = []
        if self.use_mecab:
            # Mecab with the Korean dictionary (mecab-ko-dic). Note this
            # re-creates the tagger on every row; hoisting it out of the
            # loop would be cheaper.
            self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
            pos = self.mecab.pos(line)
            for word, tag in pos:
                words.append(word)
        else:
            words = str(line).split(' ')
        match_keys = self._check_all_match(words)
        aug_data = self._aug_sent(match_keys, words, [])
        self._intent_formatter(aug_data, key, idx)
        if i % 100 == 0:
            print("====Thread{0} : {1} line job done".format(idx, i))
        i = i + 1
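The helpers _check_all_match, _aug_sent, and _intent_formatter are defined elsewhere in the class, but the Mecab tokenization step can be tried in isolation. A minimal sketch, assuming the same dictionary path as above and an arbitrary sample sentence:

from konlpy.tag import Mecab

mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
words = [word for word, tag in mecab.pos(u'자연어 처리는 재미있다')]
print(words)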
# Excerpt from a streamer handler: requires module-level `import io` and
# `import colorama`; pprint here appears to be a local two-argument helper
# (the stdlib pprint takes a stream, not a string, as its second argument).
filename = "{}{}{}.{}".format(
    self.dirname,
    self.options.output_prefix,
    word_count,
    self.options.output_extension
)
# Append the tweet to the per-word-count output file.
n_word_file = io.open(filename, 'a', encoding='utf-8')
n_word_file.write(tweet)
n_word_file.write("\n")
if self.options.verbose:
    # Wrap every tracked word in the tweet with cyan ANSI color codes.
    for word in self.words:
        tweet = (colorama.Fore.CYAN + word).join(tweet.split(word))
        tweet = (word + colorama.Fore.RESET).join(tweet.split(word))
    pprint(word_count, tweet)
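The two split/join lines are a compact highlighter: the first pass prepends the color code to every occurrence of a word, the second appends the reset code. A self-contained sketch of the same trick (sample strings made up for illustration):

import colorama

def highlight(text, words):
    # Surround each occurrence of each word with cyan/reset ANSI codes.
    for word in words:
        text = (colorama.Fore.CYAN + word).join(text.split(word))
        text = (word + colorama.Fore.RESET).join(text.split(word))
    return text

print(highlight("big data, big ideas", ["big"]))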
import os
import codecs

import gensim
import numpy as np
import tensorflow as tf
from konlpy.tag import Twitter  # Twitter was later renamed Okt in konlpy

os.chdir("C:\\Users\\jbk48\\Desktop\\Sentimental-Analysis-master\\Sentimental-Analysis-master\\Word2Vec\\Movie_rating_data")

def read_data(filename):
    # The rating files are tab-separated with a header row.
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]  # drop the header
    return data

train_data = read_data('ratings_train.txt')
test_data = read_data('ratings_test.txt')

pos_tagger = Twitter()

def tokenize(doc):
    # Join each morpheme with its POS tag, e.g. '영화/Noun'.
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
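For illustration, tokenize can be called directly on a raw sentence (an arbitrary example; the exact morphemes and tags depend on the KoNLPy version):

print(tokenize(u'이 영화 정말 재밌다'))  # prints a list of 'morpheme/TAG' strings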
## training Word2Vec model using skip-gram
tokens = [tokenize(row[1]) for row in train_data]
# gensim 3.x API: `size` and `model.iter` were renamed vector_size/epochs in 4.x.
model = gensim.models.Word2Vec(size=300, sg=1, alpha=0.025, min_alpha=0.025, seed=1234)
model.build_vocab(tokens)
for epoch in range(30):
    model.train(tokens, total_examples=model.corpus_count, epochs=model.iter)
    # Manual learning-rate decay across the 30 passes.
    model.alpha -= 0.002
    model.min_alpha = model.alpha
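Once training finishes, the embeddings can be queried through model.wv (gensim 3.x layout); the query key must use the same 'morpheme/TAG' format that tokenize produces, and the filename below is only illustrative:

print(model.wv.most_similar(u'영화/Noun', topn=5))  # nearest neighbours, if the key is in the vocabulary
model.save('word2vec_movie.model')  # hypothetical output path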