def __init__(self, tagger, useful_tags, delimiters, min_token_length, stopwords, **kwargs):
    """Select a KoNLPy tagger backend and store the tokenization settings."""
    if tagger == 'twitter':
        self.tagger = taggers.Twitter()
        self.tagger_options = {
            'norm': bool(kwargs.get('norm', True)),
            'stem': bool(kwargs.get('stem', True)),
        }
    elif tagger == 'komoran':
        self.tagger = taggers.Komoran()
        self.tagger_options = {
            'flatten': bool(kwargs.get('flatten', True)),
        }
    elif tagger == 'hannanum':
        self.tagger = taggers.Hannanum()
        self.tagger_options = {
            'ntags': int(kwargs.get('ntags', 9)),
            'flatten': bool(kwargs.get('flatten', True)),
        }
    elif tagger == 'kkma':
        self.tagger = taggers.Kkma()
        self.tagger_options = {
            'flatten': bool(kwargs.get('flatten', True)),
        }
    elif tagger == 'mecab':
        self.tagger = taggers.Mecab()
        self.tagger_options = {
            'flatten': bool(kwargs.get('flatten', True)),
        }
    else:
        raise LexRankError("available taggers are: twitter, komoran, hannanum, kkma, mecab")
    self.useful_tags = useful_tags
    self.delimiters = delimiters
    self.stopwords = stopwords
    self.min_token_length = min_token_length
    self.splitter = self.splitterer()
    self.pos = lambda text: self.tagger.pos(text, **self.tagger_options)
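# A minimal usage sketch, assuming the __init__ above belongs to a sentence
# tokenizer class (KoreanTokenizer is a hypothetical name; the snippet never
# shows the class declaration) and that `taggers` wraps the KoNLPy tagger
# classes. Extra keyword arguments such as flatten flow into tagger_options.
tokenizer = KoreanTokenizer(
    tagger='mecab',
    useful_tags=['NNG', 'NNP', 'VV', 'VA'],  # POS tags worth keeping
    delimiters=['. ', '\n'],                 # sentence boundary markers
    min_token_length=2,
    stopwords=['하다', '되다'],
    flatten=True,                            # forwarded to the tagger via tagger_options
)
print(tokenizer.pos('재미있는 예제 문장입니다.'))
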
import numpy as np
import pandas as pd
import json
import re
import konlpy
from konlpy.tag import Kkma
from konlpy.utils import pprint
import konlpy.tag as tag
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
mecab = konlpy.tag.Mecab()  # requires the MeCab engine and mecab-ko dictionary to be installed
df = pd.read_json('data/Data_1_month.json')  # one month of article data
import requests  # json and pandas are already imported above

r = requests.get('http://rank.search.naver.com/rank.js')
pd.json_normalize(json.loads(r.text), ['data', 'data'])  # pandas.io.json.json_normalize is deprecated
json.loads(r.text)['ts']  # the 'ts' (timestamp) field of the response
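# A toy illustration (hypothetical payload, not the real Naver response) of the
# record path used above: json_normalize descends into payload['data'], then
# into each item's 'data' list, yielding one row per innermost record.
payload = {'ts': '20170101', 'data': [{'data': [{'rank': 1, 'keyword': '예시'}]}]}
print(pd.json_normalize(payload, ['data', 'data']))
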
test = df.loc[12649, 'Contents']  # sample article, used in the Kkma/NLTK exploration below
df['Lemmatized'] = ""
for i in range(df.shape[0]):
    Content = df.loc[i, 'articleContents']
    # keep only the nouns as a cheap stand-in for lemmatization
    df.at[i, 'Lemmatized'] = ' '.join(mecab.nouns(Content))  # set_value() is deprecated
    if i % 2000 == 0:
        print(i)  # progress indicator
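# An equivalent vectorized form (a sketch; it assumes every articleContents
# value is a string) that avoids the explicit index loop:
df['Lemmatized'] = df['articleContents'].map(lambda text: ' '.join(mecab.nouns(text)))
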
temp = df['Lemmatized'].to_numpy()  # assumed: the lemmatized corpus (the snippet never defines temp)
temp = np.append(temp, '트럼프')  # append the query term '트럼프' (Trump) as a pseudo-document
tfidf = TfidfVectorizer(max_features=1000, lowercase=False)
temp2 = tfidf.fit_transform(temp)
# rows are L2-normalized by default, so the dot products are cosine similarities
Similarity = (temp2 * temp2.T).A
Similarity[10591].argsort()[::-1][:10]  # indices of the 10 documents most similar to document 10591
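# A small helper (hypothetical, not in the original notebook) wrapping the
# lookup above: return the k most similar document indices, skipping the
# query document itself.
def most_similar(similarity, index, k=10):
    order = similarity[index].argsort()[::-1]  # descending similarity
    return [j for j in order if j != index][:k]

print(most_similar(Similarity, 10591))
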
df['Contents'].to_numpy()[64162]  # as_matrix() was removed from pandas; to_numpy() replaces it
df['Contents'].to_numpy()[60000]
df['EventCount'] = df['Contents'].str.len()  # character length of each article
df['EventCount'].max()
test
kkma_tagger = tag.Kkma()  # renamed from `temp`, which already holds the corpus array above
kkma_tagger.pos(test)
tokens = kkma_tagger.nouns(test)
tokens
ko = nltk.Text(tokens, name="실험용")  # "실험용" means "for testing"
ko.common_contexts(['전'])  # common_contexts expects a list of words
print(len(ko.tokens))  # total number of tokens
print(len(set(ko.tokens)))  # number of unique tokens
ko.vocab()  # FreqDist over the tokens
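# ko.vocab() is an nltk FreqDist, so the most frequent nouns can be listed
# directly:
for word, count in ko.vocab().most_common(10):
    print(word, count)
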
kkma = Kkma()
df
Content = df.loc[1070, 'Contents']  # df.loc[row, col] avoids chained indexing
Content
kkma.nouns(Content)
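# A closing sketch (hypothetical helper) of the filtering that the tokenizer's
# __init__ above sets up: keep POS-tagged tokens whose tag is useful, that are
# long enough, and that are not stopwords. NNG/NNP are Kkma's common/proper
# noun tags; the stopword '기자' ("reporter") is an illustrative choice.
def filter_tokens(pos_pairs, useful_tags, stopwords, min_token_length):
    return [word for word, pos in pos_pairs
            if pos in useful_tags
            and len(word) >= min_token_length
            and word not in stopwords]

print(filter_tokens(kkma.pos(Content), useful_tags={'NNG', 'NNP'},
                    stopwords={'기자'}, min_token_length=2))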