def _conv_type_b(self, idx):
    """
    Read the pattern CSV, tokenize each 'encode' line, augment the
    tokenized sentence, and hand the result to the intent formatter.
    :param idx: worker index, used only for progress logging
    :return: None
    """
    df_csv_read = pd.read_csv(self.pattern_data_path,
                              skipinitialspace=True,
                              engine="python",
                              encoding='utf-8-sig')
    i = 0
    for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values):
        words = []
        if self.use_mecab:
            self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
            pos = self.mecab.pos(line)
            for word, tag in pos:
                words.append(word)
        else:
            words = str(line).split(' ')
        match_keys = self._check_all_match(words)
        aug_data = self._aug_sent(match_keys, words, [])
        self._intent_formatter(aug_data, key, idx)
        if i % 100 == 0:
            print("====Thread{0} : {1} line job done".format(idx, i))
        i += 1
def _pos_raw_data(self, lt):
    """
    POS-tag every raw sentence with Mecab.
    :param lt: list of raw strings
    :return: list of "word/tag" strings
    """
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    return_arr = []
    for raw in lt:
        pos = mecab.pos(raw)
        for word, tag in pos:
            return_arr.append("{0}/{1}".format(word, tag))
    return return_arr
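For reference, a minimal sketch of the "word/tag" format this helper produces (assumes konlpy and mecab-ko-dic are installed at the path above; the sample sentence and its tags are illustrative):

from konlpy.tag import Mecab

mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
print(["{0}/{1}".format(word, tag) for word, tag in mecab.pos("밥을 먹었다")])
# e.g. ['밥/NNG', '을/JKO', '먹/VV', '었/EP', '다/EF']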
def ko_lemmatize_nouns(inputString):
    '''
    Input: string (Korean)
    Output: list of strings (Korean)
    ----------------------------------------------------------------------------
    Returns list of nouns from the input.
    '''
    mecab = Mecab()
    return mecab.nouns(inputString)
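A quick usage sketch (assumes konlpy and the default mecab-ko dictionary are installed; the sample sentence and its output are illustrative):

from konlpy.tag import Mecab

print(ko_lemmatize_nouns('자연어 처리는 재미있다'))
# e.g. ['자연어', '처리']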
def __init__(self):
    try:
        from konlpy.tag import Mecab
    except ImportError:
        raise ImportError(
            'Mecab is not installed. '
            'You can install Mecab with "sh scripts/install_mecab.sh". '
            'Refer to the installation guide at https://github.com/lyeoni/prenlp/blob/master/scripts/install_mecab.sh or https://bitbucket.org/eunjeon/mecab-ko-dic/src')
    self.tokenizer = Mecab()
def _mecab_parse(self, str_arr, tag_combine=True):
    """
    POS-tag each string with Mecab and flatten the tagged output.
    :param str_arr: list of strings to parse
    :param tag_combine: whether to combine each word with its POS tag
    :return: flat list of parsed tokens
    """
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    return_arr = []
    for data in str_arr:
        return_arr += self._flat(mecab.pos(str(data)), tag_combine=tag_combine)
    return return_arr
def __init__(self):
    self.mecab = Mecab()  # for annotation
    self.table = parse_table()
    self.cmu = cmudict.dict()  # for English
    self.rule2text = get_rule_id2text()  # for comments on the main rules
def _pos_tag_predict_data(self, x_input):
    """
    Tokenize and pad the prediction input, keeping "word/tag" pairs.
    :param x_input: raw input string to tag
    :return: list of "word/tag" tokens (tag omitted when empty)
    """
    word_list = []
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    for word_tuple in self._pad_predict_input(mecab.pos(x_input)):
        if len(word_tuple[1]) > 0:
            word = ''.join([word_tuple[0], "/", word_tuple[1]])
        else:
            word = word_tuple[0]
        word_list.append(word)
    return word_list
print('#{} =============='.format(iter_ * config.batch_size + i))
print('Actu:\t{}\nPred:\t{}\n'.format(target_sentences, pred_sentences))
if __name__ == '__main__':
    config = argparser()
    print(config)

    # Load vocabulary
    import pickle
    with open(config.vocab, 'rb') as reader:
        vocab = pickle.load(reader)

    # Select tokenizer
    if config.tokenizer == 'mecab':
        from konlpy.tag import Mecab
        tokenizer = Tokenizer(tokenization_fn=Mecab().morphs,
                              vocab=vocab, max_seq_length=config.max_seq_len)

    # Build dataloader
    corpus = Corpus(corpus_path=config.corpus, tokenizer=tokenizer,
                    model_type=config.model_type, cuda=config.cuda)
    loader = DataLoader(dataset=corpus, batch_size=config.batch_size)

    # Load model with trained parameters
    if config.model_type == 'LSTM':
        model = LSTMLM(input_size=len(vocab),
                       embedding_size=config.embedding_size,
                       hidden_size=config.hidden_size,
                       output_size=len(vocab),
                       n_layers=config.n_layers,
                       dropout_p=config.dropout_p)
    elif config.model_type == 'BiLSTM':
        # snippet truncated here; remaining arguments assumed to mirror LSTMLM
        model = BiLSTMLM(input_size=len(vocab),
                         embedding_size=config.embedding_size,
                         hidden_size=config.hidden_size,
                         output_size=len(vocab),
                         n_layers=config.n_layers,
                         dropout_p=config.dropout_p)
def load_tokenizer(lang):
    if lang == "ko":
        from konlpy.tag import Mecab
        tokenizer = Mecab()
    elif lang == "ja":
        import Mykytea
        opt = "-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_tw":
        import jieba
        tokenizer = jieba
    elif lang == "vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang == "th":
        from pythainlp.tokenize import word_tokenize
        tokenizer = word_tokenize  # snippet cut off here; this assignment and the return are assumed
    return tokenizer
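A hedged usage sketch for the Korean branch (assumes konlpy and mecab-ko-dic are installed; `morphs` is konlpy's Mecab morpheme-tokenization method, and the sample output is illustrative):

tokenizer = load_tokenizer("ko")
print(tokenizer.morphs("안녕하세요"))
# e.g. ['안녕', '하', '세요']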