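# The excerpt below appears to be the body of a bucketed batch sampler: it clusters
# sentences by length (kmeans over sentence lengths), then splits each bucket into
# chunks of roughly batch_size sentences so that every batch pads sentences of
# similar length. Names such as kmeans, randperm, arange, tolist, self.len_of_sent
# and self.batched_inputs_to_batches are defined elsewhere in the enclosing
# class/module and are assumed here.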
lengths = [self.len_of_sent(i) for i in corpus]
if len(corpus) < 32:
    n_buckets = 1
else:
    n_buckets = min(self.config.n_buckets, len(corpus))
buckets = dict(zip(*kmeans(lengths, n_buckets)))
sizes, buckets = zip(*[
    (size, bucket) for size, bucket in buckets.items()
])
# the number of chunks in each bucket, clipped to the range [1, len(bucket)]
chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
          for size, bucket in zip(sizes, buckets)]
range_fn = randperm if shuffle else arange
max_samples_per_batch = self.config.get('max_samples_per_batch', None)
for i in tolist(range_fn(len(buckets))):
    # how many sentences go into each batch drawn from bucket i
    split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                   for j in range(chunks[i])]
    for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
        indices = [buckets[i][j] for j in tolist(batch_indices)]
        if max_samples_per_batch:
            for j in range(0, len(indices), max_samples_per_batch):
                yield from self.batched_inputs_to_batches(
                    corpus, indices[j:j + max_samples_per_batch], shuffle)
        else:
            yield from self.batched_inputs_to_batches(corpus, indices, shuffle)
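# A minimal, self-contained sketch of the same length-bucketing idea, kept separate
# from the HanLP code above. It uses a simple sort-and-slice grouping instead of
# kmeans; bucket_by_length and the toy lengths below are illustrative only and are
# not part of HanLP's API.
import random

def bucket_by_length(lengths, n_buckets):
    # Sort sentence indices by length, then cut the sorted order into
    # n_buckets contiguous groups so each group holds similar lengths.
    order = sorted(range(len(lengths)), key=lambda i: lengths[i])
    size = max(1, (len(order) + n_buckets - 1) // n_buckets)
    return [order[i:i + size] for i in range(0, len(order), size)]

toy_lengths = [random.randint(1, 50) for _ in range(100)]
for bucket in bucket_by_length(toy_lengths, n_buckets=4):
    random.shuffle(bucket)  # shuffle within the bucket, as the sampler above does
    batch = bucket[:8]      # draw one batch of length-similar sentences
    print(len(batch), sorted(toy_lengths[i] for i in batch))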
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-06 20:23
import hanlp
from hanlp.common.component import KerasComponent
tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN)
print(tagger('商品 和 服务'.split()))
tagger.serve()
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot
cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir, transformer='bert-base-chinese',
metrics='f1')
# tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
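# The tokenizer scripts below all follow the same workflow: fit() trains and saves
# the model into save_dir, load() restores it (optionally overriding config such as
# metrics), predict() segments raw sentences, and evaluate() scores the model on a
# held-out test set.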
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot
cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_albert_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir,
transformer='/home/ubuntu/hankcs/laser/data/transformer/albert_base_tf2',
metrics='f1', learning_rate=5e-5, epochs=3)
tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot
cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_100million'
tokenizer.fit('data/cws/large/all.txt', CTB6_CWS_VALID, save_dir, transformer='bert-base-chinese',
metrics='accuracy', batch_size=32)
tokenizer.load(save_dir, metrics='f1')
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 21:58
import tensorflow as tf
from hanlp.components.tok import NgramConvTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot
cdroot()
tokenizer = NgramConvTokenizer()
save_dir = 'data/model/cws/ctb6_cws'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
epsilon=1e-8, clipnorm=5)
tokenizer.fit(CTB6_CWS_TRAIN,
CTB6_CWS_VALID,
save_dir,
word_embed={'class_name': 'HanLP>Word2VecEmbedding',
'config': {
'trainable': True,
'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
'expand_vocab': False,
'lowercase': False,
}},
optimizer=optimizer,
window_size=0,
weight_norm=True)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')