How to use pyhanlp - 10 common examples

To help you get started, we’ve selected a few pyhanlp examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github hankcs / pyhanlp / tests / demos / demo_text_classification.py View on Github external
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-05-23 17:26
import os

from pyhanlp import SafeJClass
from tests.test_utility import ensure_data

# Handles to the HanLP Java classes used by this demo, resolved through SafeJClass.
NaiveBayesClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# Location of the mini Sogou text-classification corpus as returned by ensure_data
# (presumably fetched from the URL when not present locally -- confirm in tests.test_utility).
sogou_corpus_path = ensure_data('搜狗文本分类语料库迷你版',
                                'http://file.hankcs.com/corpus/sogou-text-classification-corpus-mini.zip')


def train_or_load_classifier():
    """
    Build a NaiveBayesClassifier for the Sogou corpus, reusing a cached model when possible.

    The serialized model lives next to the corpus at ``sogou_corpus_path + '.ser'``.
    If that file exists it is read back with IOUtil; otherwise a classifier is
    trained on the corpus and its model is saved to that path for later runs.

    :return: a NaiveBayesClassifier wrapping the loaded or freshly trained model
    """
    model_path = sogou_corpus_path + '.ser'
    if not os.path.isfile(model_path):
        # No cached model yet: train from the corpus and persist the result.
        trainer = NaiveBayesClassifier()
        trainer.train(sogou_corpus_path)
        model = trainer.getModel()
        IOUtil.saveObjectTo(model, model_path)
    else:
        model = IOUtil.readObjectFrom(model_path)
    return NaiveBayesClassifier(model)


def predict(classifier, text):
github hankcs / pyhanlp / tests / demos / demo_text_classification.py View on Github external
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-05-23 17:26
import os

from pyhanlp import SafeJClass
from tests.test_utility import ensure_data

# Handles to the HanLP Java classes used by this demo, resolved through SafeJClass.
NaiveBayesClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# Location of the mini Sogou text-classification corpus as returned by ensure_data
# (presumably fetched from the URL when not present locally -- confirm in tests.test_utility).
sogou_corpus_path = ensure_data('搜狗文本分类语料库迷你版',
                                'http://file.hankcs.com/corpus/sogou-text-classification-corpus-mini.zip')


def train_or_load_classifier():
    """
    Build a NaiveBayesClassifier for the Sogou corpus, reusing a cached model when possible.

    The serialized model lives next to the corpus at ``sogou_corpus_path + '.ser'``.
    If that file exists it is read back with IOUtil; otherwise a classifier is
    trained on the corpus and its model is saved to that path for later runs.

    :return: a NaiveBayesClassifier wrapping the loaded or freshly trained model
    """
    model_path = sogou_corpus_path + '.ser'
    if not os.path.isfile(model_path):
        # No cached model yet: train from the corpus and persist the result.
        trainer = NaiveBayesClassifier()
        trainer.train(sogou_corpus_path)
        model = trainer.getModel()
        IOUtil.saveObjectTo(model, model_path)
    else:
        model = IOUtil.readObjectFrom(model_path)
    return NaiveBayesClassifier(model)
github hankcs / pyhanlp / tests / test_multithread.py View on Github external
# On Python 2 only: reload the sys module and set the default encoding to UTF-8.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # raise "Must be using Python 3"

from absl import flags  # absl-py
from absl import logging  # absl-py

# Module-level alias for absl's flags.FLAGS object.
FLAGS = flags.FLAGS
import unittest
import threading
import time
from pyhanlp import HanLP, SafeJClass

# Import the class name thread-safely with SafeJClass, outside the thread body
CRFLexicalAnalyzer = SafeJClass("com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer")


class MyThread(threading.Thread):
    """Thread that calls ``analyzer.analyze`` on a fixed sentence ``counter`` times."""

    def __init__(self, name, counter, analyzer):
        """
        :param name: label stored as ``thread_name`` and used in the printed output
        :param counter: number of remaining iterations; the loop runs while it is truthy
        :param analyzer: object exposing an ``analyze(text)`` method
        """
        super(MyThread, self).__init__()
        self.thread_name = name
        self.counter = counter
        self.analyzer = analyzer

    def run(self):
        """Announce the start, then analyze the sample sentence once per second until the counter hits zero."""
        print("Starting " + self.thread_name)
        while self.counter:
            time.sleep(1)
            segmented = self.analyzer.analyze("商品和服务")
            timestamp = time.ctime(time.time())
            print("%s: %s, seg: %s" % (self.thread_name, timestamp, segmented))
            self.counter -= 1
github hankcs / pyhanlp / tests / test_utility.py View on Github external
def test_data_path():
    """
    Return the test data directory ``HANLP_DATA_PATH/test``, creating it if needed.

    Per the original note, this is ``$root/data/test`` with the root directory
    taken from the configuration file.

    :return: path of the test data directory
    :raises OSError: if the directory does not exist and cannot be created
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    try:
        # makedirs also creates missing parent directories, which os.mkdir would not.
        os.makedirs(data_path)
    except OSError:
        # An already-existing directory (possibly created concurrently) is fine;
        # anything else is a real failure and must propagate.
        if not os.path.isdir(data_path):
            raise
    return data_path
github hankcs / pyhanlp / pyhanlp / static / __init__.py View on Github external
ratio = progress_size / total_size
                ratio = max(1e-8, ratio)
                percent = ratio * 100
                eta = duration / ratio * (1 - ratio)
                minutes = eta / 60
                seconds = eta % 60
                sys.stdout.write("\r%.2f%%, %d MB, %d KB/s, 还有 %d 分 %2d 秒   " %
                                 (percent, progress_size / (1024 * 1024), speed, minutes, seconds))
                sys.stdout.flush()

            import socket
            socket.setdefaulttimeout(10)
            urllib.urlretrieve(url, tmp_path, reporthook)
            print()
        except BaseException as e:
            eprint('下载失败 {} 由于 {}'.format(url, repr(e)))
            doc_url = 'https://github.com/hankcs/pyhanlp'
            eprint('请参考 %s 执行手动安装.' % doc_url)
            eprint('或手动下载 {} 到 {}'.format(url, path))
            if os.path.isfile(tmp_path):
                os.remove(tmp_path)
            browser_open(doc_url)
            exit(1)
        remove_file(path)
        os.rename(tmp_path, path)
    return True
github hankcs / pyhanlp / pyhanlp / __init__.py View on Github external
STATIC_ROOT)
    else:
        HANLP_JAR_VERSION = os.path.basename(HANLP_JAR_PATH)[len('hanlp-'):-len('.jar')]

        if HANLP_VERBOSE:
            print("加载 HanLP jar [%s] ..." % HANLP_JAR_PATH)
            print("加载 HanLP config [%s/hanlp.properties] ..." % (STATIC_ROOT))
            print("加载 HanLP data [%s/data] ..." % (STATIC_ROOT))

    java_url = 'https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html'
    pathsep = os.pathsep
    jvm_path = None
    try:
        jvm_path = getDefaultJVMPath()
    except JVMNotFoundException as e:
        eprint('找不到Java,请安装JDK8:%s' % java_url)
        browser_open(java_url)
        exit(1)
    except JVMNotSupportedException as e:
        eprint('Java位数与Python不一致,请重新安装一致的Java、Python、JPype1(必须都为32位或64位)')
        browser_open(java_url)
        exit(1)
    if platform.system().startswith('CYGWIN'):
        if not jvm_path.startswith('/cygdrive'):  # CYGWIN 使用了宿主机器的JVM,必须将路径翻译为真实路径
            pathsep = ';'
            if STATIC_ROOT.startswith('/usr/lib'):
                cygwin_root = os.popen('cygpath -w /').read().strip().replace('\\', '/')
                STATIC_ROOT = cygwin_root + STATIC_ROOT[len('/usr'):]
                HANLP_JAR_PATH = cygwin_root + HANLP_JAR_PATH[len('/usr'):]
                PATH_CONFIG = cygwin_root + PATH_CONFIG[len('/usr'):]
            elif STATIC_ROOT.startswith('/cygdrive'):
                driver = STATIC_ROOT.split('/')
github hankcs / pyhanlp / pyhanlp / __init__.py View on Github external
# 启动JVM
    startJVM(
        jvm_path,
        JAVA_JAR_CLASSPATH,
        "-Xms%s" %
        HANLP_JVM_XMS,
        "-Xmx%s" %
        HANLP_JVM_XMX, convertStrings=True)
    # 确保启动正常
    try:
        JClass('com.hankcs.hanlp.HanLP')
    except java.lang.NoClassDefFoundError as e:
        from pyhanlp.static import install_hanlp_jar
        eprint('你的 {} 破损了,现在重新下载'.format(HANLP_JAR_PATH))
        install_hanlp_jar()
        eprint('下载成功,请重新启动程序')
        exit(1)
github hankcs / pyhanlp / pyhanlp / static / __init__.py View on Github external
percent = ratio * 100
                eta = duration / ratio * (1 - ratio)
                minutes = eta / 60
                seconds = eta % 60
                sys.stdout.write("\r%.2f%%, %d MB, %d KB/s, 还有 %d 分 %2d 秒   " %
                                 (percent, progress_size / (1024 * 1024), speed, minutes, seconds))
                sys.stdout.flush()

            import socket
            socket.setdefaulttimeout(10)
            urllib.urlretrieve(url, tmp_path, reporthook)
            print()
        except BaseException as e:
            eprint('下载失败 {} 由于 {}'.format(url, repr(e)))
            doc_url = 'https://github.com/hankcs/pyhanlp'
            eprint('请参考 %s 执行手动安装.' % doc_url)
            eprint('或手动下载 {} 到 {}'.format(url, path))
            if os.path.isfile(tmp_path):
                os.remove(tmp_path)
            browser_open(doc_url)
            exit(1)
        remove_file(path)
        os.rename(tmp_path, path)
    return True
github hankcs / pyhanlp / pyhanlp / __init__.py View on Github external
if HANLP_VERBOSE:
            print("加载 HanLP jar [%s] ..." % HANLP_JAR_PATH)
            print("加载 HanLP config [%s/hanlp.properties] ..." % (STATIC_ROOT))
            print("加载 HanLP data [%s/data] ..." % (STATIC_ROOT))

    java_url = 'https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html'
    pathsep = os.pathsep
    jvm_path = None
    try:
        jvm_path = getDefaultJVMPath()
    except JVMNotFoundException as e:
        eprint('找不到Java,请安装JDK8:%s' % java_url)
        browser_open(java_url)
        exit(1)
    except JVMNotSupportedException as e:
        eprint('Java位数与Python不一致,请重新安装一致的Java、Python、JPype1(必须都为32位或64位)')
        browser_open(java_url)
        exit(1)
    if platform.system().startswith('CYGWIN'):
        if not jvm_path.startswith('/cygdrive'):  # CYGWIN 使用了宿主机器的JVM,必须将路径翻译为真实路径
            pathsep = ';'
            if STATIC_ROOT.startswith('/usr/lib'):
                cygwin_root = os.popen('cygpath -w /').read().strip().replace('\\', '/')
                STATIC_ROOT = cygwin_root + STATIC_ROOT[len('/usr'):]
                HANLP_JAR_PATH = cygwin_root + HANLP_JAR_PATH[len('/usr'):]
                PATH_CONFIG = cygwin_root + PATH_CONFIG[len('/usr'):]
            elif STATIC_ROOT.startswith('/cygdrive'):
                driver = STATIC_ROOT.split('/')
                cygwin_driver = '/'.join(driver[:3])
                win_driver = driver[2].upper() + ':'
                HANLP_JAR_PATH = HANLP_JAR_PATH.replace(cygwin_driver, win_driver)
                STATIC_ROOT = STATIC_ROOT.replace(cygwin_driver, win_driver)
github hankcs / pyhanlp / pyhanlp / __init__.py View on Github external
JAVA_JAR_CLASSPATH = JAVA_JAR_CLASSPATH + pathsep + os.path.join(STATIC_ROOT, jar)
    if HANLP_VERBOSE: print("设置 JAVA_JAR_CLASSPATH [%s]" % JAVA_JAR_CLASSPATH)
    # 启动JVM
    startJVM(
        jvm_path,
        JAVA_JAR_CLASSPATH,
        "-Xms%s" %
        HANLP_JVM_XMS,
        "-Xmx%s" %
        HANLP_JVM_XMX, convertStrings=True)
    # 确保启动正常
    try:
        JClass('com.hankcs.hanlp.HanLP')
    except java.lang.NoClassDefFoundError as e:
        from pyhanlp.static import install_hanlp_jar
        eprint('你的 {} 破损了,现在重新下载'.format(HANLP_JAR_PATH))
        install_hanlp_jar()
        eprint('下载成功,请重新启动程序')
        exit(1)