#coding:utf-8
import paddlehub as hub
simnet_bow = hub.Module(name="simnet_bow")
# Chinese test data: text_1 repeats the query "这道题太难了" ("This problem is too hard");
# text_2 holds the candidate sentences it is compared against.
test_text_1 = ["这道题太难了", "这道题太难了", "这道题太难了"]
test_text_2 = ["这道题是上一年的考题", "这道题不简单", "这道题很有意思"]
inputs = {"text_1": test_text_1, "text_2": test_text_2}
results = simnet_bow.similarity(data=inputs)
max_score = -1
result_text = ""
for result in results:
    if result['similarity'] > max_score:
        max_score = result['similarity']
        result_text = result['text_2']
print("The best match for %s is %s" % (test_text_1[0], result_text))
import argparse
import ast

import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch when the program predicts.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
    # Load PaddleHub senta pretrained model
module = hub.Module(name="senta_bilstm")
inputs, outputs, program = module.context(trainable=True)
# Download dataset and use LACClassifyReader to read dataset
dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.LACClassifyReader(
dataset=dataset, vocab_path=module.get_vocab_path())
sent_feature = outputs["sentence_feature"]
# Setup feed list for data feeder
    # Must feed all the tensors that the senta module needs
    feed_list = [inputs["words"].name]
    # Setup running config for the PaddleHub Fine-tune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir)
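    # Sketch (assumption): a typical continuation of this setup with the PaddleHub
    # Fine-tune API, wiring the reader, sentence feature and feed list into a
    # text-classification task and starting fine-tuning. Not part of the original
    # snippet; names follow the API used elsewhere in this file.
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=sent_feature,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config)
    cls_task.finetune_and_eval()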
import paddle.fluid as fluid
import paddlehub as hub
if __name__ == "__main__":
resnet_module = hub.Module(module_dir="ResNet50.hub_module")
input_dict, output_dict, program = resnet_module.context(trainable=True)
dataset = hub.dataset.Flowers()
data_reader = hub.reader.ImageClassificationReader(
image_width=resnet_module.get_excepted_image_width(),
image_height=resnet_module.get_excepted_image_height(),
images_mean=resnet_module.get_pretrained_images_mean(),
images_std=resnet_module.get_pretrained_images_std(),
dataset=dataset)
with fluid.program_guard(program):
label = fluid.layers.data(name="label", dtype="int64", shape=[1])
img = input_dict[0]
feature_map = output_dict[0]
        config = hub.RunConfig(
            use_cuda=True,
            num_epoch=10)
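        # Sketch (assumption): one way this setup typically continues with the early
        # PaddleHub fine-tune API, which pairs the explicit label layer above with an
        # image-classification task. Not part of the original snippet; the helper names
        # below are assumptions about that API version.
        feed_list = [img.name, label.name]
        task = hub.create_img_cls_task(
            feature=feature_map, label=label, num_classes=dataset.num_labels)
        hub.finetune_and_eval(
            task, feed_list=feed_list, data_reader=data_reader, config=config)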
    def __init__(self, vocab_path, dataset=None, in_tokens=False):
        self.dataset = dataset
        # LAC module used for Chinese lexical analysis (word segmentation)
        self.lac = hub.Module(name="lac")
        # Full tokenizer built from the given vocabulary, without lowercasing
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=False)
        self.vocab = self.tokenizer.vocab
        # Name of the first input field expected by LAC's "lexical_analysis" signature
        self.feed_key = list(
            self.lac.processor.data_format(
                sign_name="lexical_analysis").keys())[0]
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
        self.in_tokens = in_tokens
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
    # Load PaddleHub ERNIE Tiny pretrained model
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics
    # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
# metric should be acc, f1 or matthews
dataset = hub.dataset.ChnSentiCorp()
metrics_choices = ["acc"]
    # For ernie_tiny, sub-words are used to tokenize Chinese sentences
    # For models other than ernie_tiny, sp_model_path and word_dict_path should be set to None
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())
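    # Sketch (assumption): for a module other than ernie_tiny, the comment above says the
    # sub-word paths should be left unset, e.g.:
    # reader = hub.reader.ClassifyReader(
    #     dataset=dataset,
    #     vocab_path=module.get_vocab_path(),
    #     max_seq_len=args.max_seq_len,
    #     sp_model_path=None,
    #     word_dict_path=None)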
import argparse
import ast

import paddlehub as hub

hub.common.logger.logger.setLevel("INFO")
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint.")
parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
    # Load PaddleHub BERT pretrained model
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ReadingComprehensionReader to read dataset
    # To load the SQuAD 2.0 dataset, set version_2_with_negative to True
dataset = hub.dataset.SQUAD(version_2_with_negative=False)
# dataset = hub.dataset.SQUAD(version_2_with_negative=True)
reader = hub.reader.ReadingComprehensionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
doc_stride=128,
max_query_length=64)
# Use "sequence_output" for token-level output.
import argparse
import ast

import paddlehub as hub
import pandas as pd
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
    # Load PaddleHub ERNIE 2.0 pretrained model
module = hub.Module(name="ernie_v2_eng_base")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
    # Download dataset and use MultiLabelClassifyReader to read the dataset
dataset = hub.dataset.Toxic()
reader = hub.reader.MultiLabelClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Setup feed list for data feeder
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]
import argparse
import ast

import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
    # Load PaddleHub senta pretrained model
module = hub.Module(name="senta_bilstm")
inputs, outputs, program = module.context(trainable=True)
# Download dataset and use LACClassifyReader to read dataset
dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.LACClassifyReader(
dataset=dataset, vocab_path=module.get_vocab_path())
sent_feature = outputs["sentence_feature"]
# Setup feed list for data feeder
    # Must feed all the tensors that the senta module needs
    feed_list = [inputs["words"].name]
    # Setup running config for the PaddleHub Fine-tune API
    config = hub.RunConfig(
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir)
import os

import paddlehub as hub


def infer_with_input_file():
# get senta module
senta = hub.Module(name="senta")
# get the input keys for signature 'sentiment_classify'
data_format = senta.processor.data_format(sign_name='sentiment_classify')
key = list(data_format.keys())[0]
# parse input file
test_file = os.path.join("test", "test.txt")
test_text = hub.io.parser.txt_parser.parse(test_file)
# set input dict
input_dict = {key: test_text}
results = senta.sentiment_classify(data=input_dict)
    for index, result in enumerate(results):
        hub.logger.info("sentence %d sentiment result: %s" %
                        (index + 1, result['sentiment_key']))
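

# Sketch (assumption): a companion helper that feeds an in-memory list of sentences
# instead of parsing a file; the signature lookup and module call mirror the function above.
def infer_with_input_text(test_text):
    senta = hub.Module(name="senta")
    data_format = senta.processor.data_format(sign_name='sentiment_classify')
    key = list(data_format.keys())[0]
    input_dict = {key: test_text}
    results = senta.sentiment_classify(data=input_dict)
    for index, result in enumerate(results):
        hub.logger.info("sentence %d sentiment result: %s" %
                        (index + 1, result['sentiment_key']))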