How to use the hanlp.datasets.cws.ctb.CTB6_CWS_VALID dataset constant in hanlp

To help you get started, we've selected a few hanlp examples based on popular ways the library is used in public projects.
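
CTB6_CWS_VALID identifies the development (dev) split of the CTB6 Chinese word-segmentation corpus. Like its siblings CTB6_CWS_TRAIN and CTB6_CWS_TEST, it is a dataset identifier that you pass to a tokenizer's fit() or evaluate(); it is never called like a function. Here is a minimal sketch of the pattern every example on this page follows, assuming the HanLP 2.0 TensorFlow API used in those examples (save_dir is a hypothetical placeholder):

from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST

tokenizer = TransformerTokenizer()
save_dir = 'data/model/my_cws'  # hypothetical output directory
# Train on the CTB6 train split, validating on the dev split:
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir,
              transformer='bert-base-chinese', metrics='f1')
# Score the trained model on the held-out test split:
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)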

github hankcs/HanLP: tests/train/zh/cws/train_large_bert_cws.py

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_100million'
# Train on a large local corpus, using the CTB6 dev split for validation.
tokenizer.fit('data/cws/large/all.txt', CTB6_CWS_VALID, save_dir, transformer='bert-base-chinese',
              metrics='accuracy', batch_size=32)
# Reload the saved model, switching the metric to span F1 for evaluation.
tokenizer.load(save_dir, metrics='f1')
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
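
This first script trains on a large local corpus ('data/cws/large/all.txt') rather than the CTB6 train split, but still validates on CTB6_CWS_VALID and scores on CTB6_CWS_TEST. Training monitors plain accuracy, which is cheaper to compute; tokenizer.load(save_dir, metrics='f1') then reloads the saved model so that evaluate() reports span F1.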

github hankcs/HanLP: tests/train/zh/cws/train_ctb6_cws_bert.py

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22

from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir, transformer='bert-base-chinese',
              metrics='f1')
# tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
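
This is the simplest end-to-end recipe: the train, dev, and test splits all come from the hanlp.datasets.cws.ctb constants, and metrics='f1' makes span F1 the metric for both training-time validation and the final evaluation.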

github hankcs/HanLP: tests/train/zh/cws/train_large_conv_cws.py

# Date: 2019-12-29 21:58

import tensorflow as tf

from hanlp.components.tok import NgramConvTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizer()
save_dir = 'data/model/cws/ctb6_cws'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_VALID,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')
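
The word_embed argument is a Keras-style serialization dict: class_name selects HanLP's registered Word2VecEmbedding, while config points filepath at the pretrained ConvSeg character vectors and sets trainable=True so they are fine-tuned along with the convolutional layers.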

github hankcs/HanLP: tests/train/zh/cws/train_large_rnn_cws.py

# Date: 2019-12-21 15:39
import tensorflow as tf

from hanlp.components.tok import RNNTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TEST, CTB6_CWS_VALID
from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
from tests import cdroot

cdroot()

tokenizer = RNNTokenizer()
save_dir = 'data/model/cws/large_rnn_cws'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
tokenizer.fit('data/cws/large/all.txt',
              CTB6_CWS_VALID,
              save_dir,
              embeddings={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': RADICAL_CHAR_EMBEDDING_100,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              early_stopping_patience=5,
              batch_size=64,
              max_seq_len=64,
              metrics='accuracy'
              )
tokenizer.load(save_dir, metrics='f1')
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))

github hankcs/HanLP: tests/train/zh/cws/train_ctb6_cws_albert.py

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22

from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_albert_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir,
              transformer='/home/ubuntu/hankcs/laser/data/transformer/albert_base_tf2',
              metrics='f1', learning_rate=5e-5, epochs=3)
tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
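
Note that transformer here is a local filesystem path to converted ALBERT weights rather than a registered model name like 'bert-base-chinese'; substitute the path to your own albert_base_tf2 directory. The lower learning rate (5e-5) and short schedule (3 epochs) are values typical for transformer fine-tuning.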

github hankcs/HanLP: tests/train/zh/cws/train_ctb6_cws_convseg.py

# Author: hankcs
# Date: 2019-12-28 22:22
import tensorflow as tf

from hanlp.components.tok import NgramConvTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizer()
save_dir = 'data/model/cws/ctb6_cws'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_VALID,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')
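
Once any of these scripts has finished, the trained model can be reloaded for segmentation alone, with no retraining. A minimal inference sketch, reusing only calls that appear in the scripts above (the path must be the save_dir you trained into):

from hanlp.components.tok import NgramConvTokenizer

# Restore the tokenizer saved by the ConvSeg script above.
tokenizer = NgramConvTokenizer()
tokenizer.load('data/model/cws/ctb6_cws')
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))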