Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# -*- coding: utf-8 -*-
import os
import io
import re
import codecs
from pypinyin.phonetic_symbol import phonetic_symbol
from pypinyin.pinyin_dict import pinyin_dict
from pypinyin.style.tone import ToneConverter
# Directory containing this file (resolved through os.path.realpath).
ROOT = os.path.dirname(os.path.realpath(__file__))
tone_converter = ToneConverter()

# Lookup table: lower-cased result of tone_converter.to_tone3() -> the pinyin
# exactly as it is stored in pinyin_dict.  When several stored pinyin produce
# the same key, the one seen last wins.
tone3_2_tone_dict = {}
for readings in pinyin_dict.values():
    # Each dictionary value is a comma-separated list of pinyin.
    for reading in readings.split(','):
        reading = reading.strip()
        if not reading:
            continue
        key = tone_converter.to_tone3(reading).strip().lower()
        if key:
            tone3_2_tone_dict[key] = reading
def tone3_to_tone1(tone3):
tone3 = tone3.strip().lower()
# 儿化
if tone3 == 'r5':
return 'er'
# 轻声
if '5' in tone3:
def test_no_copy(cleanup):
    """Test that the dictionary copy can be disabled.

    Checks that ``pypinyin.core.PINYIN_DICT`` is initially a different object
    from ``pypinyin.pinyin_dict.pinyin_dict``, then sets the
    ``PYPINYIN_NO_DICT_COPY`` environment variable, reloads
    ``pypinyin.constants`` and checks that its ``PINYIN_DICT`` is now the very
    same object as ``pypinyin.pinyin_dict.pinyin_dict``.
    """
    import pypinyin.core  # noqa

    # State before the environment variable is set by this test.
    dict_before = pypinyin.core.PINYIN_DICT
    assert dict_before is not pypinyin.pinyin_dict.pinyin_dict

    # The variable is read when the constants module executes, so the module
    # has to be reloaded for the new value to take effect.
    os.environ['PYPINYIN_NO_DICT_COPY'] = 'true'
    reload(pypinyin.constants)

    dict_after = pypinyin.constants.PINYIN_DICT
    assert dict_after is pypinyin.pinyin_dict.pinyin_dict
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import jieba
from . import phrases_dict, phonetic_symbol, pinyin_dict
# Phrase pinyin dictionary (the phrases_dict attribute of the phrases_dict module)
PHRASES_DICT = phrases_dict.phrases_dict
# Pinyin dictionary (the pinyin_dict attribute of the pinyin_dict module)
PINYIN_DICT = pinyin_dict.pinyin_dict
# Table of initials (shengmu).  Multi-letter entries are listed before the
# single letters they begin with ("zh" before "z", "yu" before "y").
# NOTE(review): presumably so that prefix matching finds the longest initial
# first -- confirm against the code that consumes this list.
INITIALS = [
    "zh", "ch", "sh",
    "b", "p", "m", "f",
    "d", "t", "n", "l",
    "g", "k", "h",
    "j", "q", "x",
    "r", "z", "c", "s",
    "yu", "y", "w",
]
# Table of finals (yunmu).
FINALS = [
    "ang", "eng", "ing", "ong",
    "an", "en", "in", "un", "er",
    "ai", "ei", "ui", "ao", "ou", "iu", "ie", "ve",
    "a", "o", "e", "i", "u", "v",
]
# Output styles, keyed by name and mapped to their integer code.
PINYIN_STYLE = {
    # Plain style, without tone marks.
    'NORMAL': 0,
    # Standard style, tone mark on the first letter of the final.
    'TONE': 1,
    # Tone placed after the pinyin, marked with a digit 1~4.
    'TONE2': 2,
    # Only the initial part.
    'INITIALS': 3,
    # Only the first letter.
    'FIRST_LETTER': 4,
}
# Characters carrying tone marks (the phonetic_symbol attribute of the
# phonetic_symbol module)
PHONETIC_SYMBOL = phonetic_symbol.phonetic_symbol
# URL template for the listing pages; the two %s slots are filled below with
# the total stroke count and the page number.
url_base = 'http://www.zdic.net/z/jbs/zbh/bs/?jzbh=%s|%s'
# Request the index page once and reuse its cookies for every listing request.
cookies = requests.get('http://www.zdic.net/z/jbs/zbh/').cookies.get_dict()
# Words found on the site that are missing from pinyin_dict.
word_list = []
# Argument passed to sleep() between requests (presumably seconds).
timer = 5

for m in range(1, 66):  # total stroke count
    sleep(timer)
    for page_num in xrange(1, 10000):  # page number
        url = url_base % (m, page_num)
        logger.debug(url)
        # NOTE(review): `headers` is not defined in this block; it is expected
        # to be provided by the surrounding code.
        html = get_one_page(url, cookies=cookies, headers=headers)
        words = parse_words(html)
        if not words:
            # Nothing parsed from this page: stop paging for this stroke count.
            break
        for word in words:
            if word not in pinyin_dict:
                logger.debug(repr(word))
                word_list.append(word)
        # Pause before requesting the next page.
        sleep(timer)

with io.open('words.txt', 'w', encoding='utf8') as f:
    for word in word_list:
        try:
            f.write(word)
        except Exception as e:
            # Best effort: log the failure and carry on with the next word.
            # The values are passed as format arguments because concatenating
            # the exception object with a str (`e + '\n'`) raises TypeError
            # and would abort the loop from inside this handler.
            logger.debug('%s\n%r', e, word)
import re
from enum import IntEnum, unique
from pypinyin import pinyin_dict
from pypinyin.compat import SUPPORT_UCS4
# Phrase pinyin dictionary.  The phrases module is only imported when the
# PYPINYIN_NO_PHRASES environment variable is unset or empty; otherwise an
# empty dict is used instead.
if not os.environ.get('PYPINYIN_NO_PHRASES'):
    from pypinyin import phrases_dict
    PHRASES_DICT = phrases_dict.phrases_dict
else:
    PHRASES_DICT = {}

# Single-character pinyin dictionary.
PINYIN_DICT = pinyin_dict.pinyin_dict

# Setting the PYPINYIN_NO_DICT_COPY environment variable skips the copy (for
# the case where no custom pinyin data is used), which reduces memory usage.
if not os.environ.get('PYPINYIN_NO_DICT_COPY'):
    PINYIN_DICT, PHRASES_DICT = PINYIN_DICT.copy(), PHRASES_DICT.copy()
# Regular expression matching a character whose tone is marked with a digit:
# one letter from [aeoiuvnm] followed by a tone digit 1-4, anchored by `$`,
# with the letter and the digit captured in separate groups.
RE_TONE2 = re.compile(r'([aeoiuvnm])([1-4])$')
# 有拼音的汉字
if SUPPORT_UCS4:
RE_HANS = re.compile(
r'^(?:['
r'\u3007' # 〇
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
def get_words(unicode_range, url_base, headers, cookies):
    """Yield ``get_word(...)`` results for code points missing from pinyin_dict.

    :param unicode_range: pair of hexadecimal strings; both ends are parsed
                          with ``int(..., 16)`` and the range is inclusive
    :param url_base: passed through unchanged to ``get_word``
    :param headers: passed through unchanged to ``get_word``
    :param cookies: passed through unchanged to ``get_word``

    Code points already present in ``pinyin_dict`` are skipped.  The generator
    calls ``sleep(1)`` after each yielded item once it is resumed, and calls
    ``sleep(120)`` (resetting its counter) whenever more than 900 items have
    been fetched since the last long pause.
    """
    first = int(unicode_range[0], 16)
    last = int(unicode_range[1], 16)
    fetched_since_pause = 0
    for code_point in xrange(first, last + 1):
        if code_point not in pinyin_dict:
            if fetched_since_pause > 900:
                # Long pause after a batch of requests, then start counting again.
                fetched_since_pause = 0
                sleep(120)
            fetched_since_pause += 1
            yield get_word(code_point, url_base, headers, cookies)
            # Short pause between consecutive fetches.
            sleep(1)