Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_pinyin_initials():
    """Check each output style for a word whose syllables have an initial and a final."""
    hans = '中心'

    # Default style (tone marks), requested without passing a style argument.
    default_expected = [['zh\u014dng'], ['x\u012bn']]
    assert pinyin(hans) == default_expected
    assert pinyin(hans, strict=False) == default_expected

    # (style, expected result); each style must give the same result with
    # the default strictness and with strict=False.
    cases = (
        # Plain style, no tone.
        (NORMAL, [['zhong'], ['xin']]),
        # Tone style: tone mark written on the vowel.
        (TONE, [['zh\u014dng'], ['x\u012bn']]),
        # Tone style 2: tone digit [1-4] placed right after the toned vowel.
        (TONE2, [['zho1ng'], ['xi1n']]),
        # Tone style 3: tone digit [1-4] placed at the end of each syllable.
        (TONE3, [['zhong1'], ['xin1']]),
        # Initials style: only the initial of each syllable.
        (INITIALS, [['zh'], ['x']]),
        # First-letter style: only the first letter of each syllable.
        (FIRST_LETTER, [['z'], ['x']]),
        # Bopomofo (zhuyin) style, with tone.
        (BOPOMOFO, [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]),
        # Bopomofo (zhuyin) style, first symbol only.
        (BOPOMOFO_FIRST, [['ㄓ'], ['ㄒ']]),
    )
    for style, expected in cases:
        assert pinyin(hans, style) == expected
        assert pinyin(hans, style, strict=False) == expected
# Rows of [character, dict of style/strict options, expected result list].
# NOTE(review): the list these rows belong to opens and closes outside this
# excerpt — presumably parametrized data for a pinyin() test; confirm against
# the enclosing definition.
['因', dict(style=FINALS, strict=False), ['in']],
['央', dict(style=NORMAL), ['yang']],
['央', dict(style=FINALS), ['iang']],
['央', dict(style=FINALS, strict=False), ['ang']],
['英', dict(style=NORMAL), ['ying']],
['英', dict(style=FINALS), ['ing']],
['英', dict(style=FINALS, strict=False), ['ing']],
['雍', dict(style=NORMAL), ['yong']],
['雍', dict(style=FINALS), ['iong']],
['雍', dict(style=FINALS, strict=False), ['ong']],
['宜', dict(style=NORMAL), ['yi']],
['宜', dict(style=NORMAL, strict=False), ['yi']],
['宜', dict(style=TONE), ['yí']],
['宜', dict(style=TONE, strict=False), ['yí']],
['宜', dict(style=TONE2), ['yi2']],
['宜', dict(style=TONE2, strict=False), ['yi2']],
['宜', dict(style=TONE3), ['yi2']],
['宜', dict(style=TONE3, strict=False), ['yi2']],
['宜', dict(style=INITIALS), ['']],
['宜', dict(style=INITIALS, strict=False), ['y']],
['宜', dict(style=FIRST_LETTER), ['y']],
['宜', dict(style=FIRST_LETTER, strict=False), ['y']],
['宜', dict(style=FINALS), ['i']],
['宜', dict(style=FINALS, strict=False), ['i']],
['宜', dict(style=FINALS_TONE), ['í']],
['宜', dict(style=FINALS_TONE, strict=False), ['í']],
['宜', dict(style=FINALS_TONE2), ['i2']],
['宜', dict(style=FINALS_TONE2, strict=False), ['i2']],
['宜', dict(style=FINALS_TONE3), ['i2']],
['宜', dict(style=FINALS_TONE3, strict=False), ['i2']],
# Finals of the ü row, when no initial precedes them, are written
# yu (迂), yue (约), yuan (冤), ...
['迂', dict(style=NORMAL), ['yu']],
['迂', dict(style=FINALS), ['v']],
['迂', dict(style=FINALS, strict=False), ['u']],
['约', dict(style=NORMAL), ['yue']],
['约', dict(style=FINALS), ['ve']],
['约', dict(style=FINALS, strict=False), ['ue']],
['冤', dict(style=NORMAL), ['yuan']],
['冤', dict(style=FINALS), ['van']],
['冤', dict(style=FINALS, strict=False), ['uan']],
['鱼', dict(style=NORMAL), ['yu']],
['鱼', dict(style=NORMAL, strict=False), ['yu']],
['鱼', dict(style=TONE), ['yú']],
['鱼', dict(style=TONE, strict=False), ['yú']],
['鱼', dict(style=TONE2), ['yu2']],
['鱼', dict(style=TONE2, strict=False), ['yu2']],
['鱼', dict(style=TONE3), ['yu2']],
['鱼', dict(style=TONE3, strict=False), ['yu2']],
['鱼', dict(style=INITIALS), ['']],
['鱼', dict(style=INITIALS, strict=False), ['y']],
['鱼', dict(style=FIRST_LETTER), ['y']],
['鱼', dict(style=FIRST_LETTER, strict=False), ['y']],
['鱼', dict(style=FINALS), ['v']],
['鱼', dict(style=FINALS, strict=False), ['u']],
['鱼', dict(style=FINALS_TONE), ['ǘ']],
['鱼', dict(style=FINALS_TONE, strict=False), ['ú']],
['鱼', dict(style=FINALS_TONE2), ['v2']],
['鱼', dict(style=FINALS_TONE2, strict=False), ['u2']],
['鱼', dict(style=FINALS_TONE3), ['v2']],
['鱼', dict(style=FINALS_TONE3, strict=False), ['u2']],
# Table of (input text, keyword options including 'style' and 'errors',
# expected result) tuples.
# NOTE(review): the tuple is not closed and the code that consumes it is not
# visible in this excerpt — presumably each row is fed to pinyin(); confirm.
def test_errors():
hans = (
('啊', {'style': TONE2}, [['a']]),
('啊a', {'style': TONE2}, [['a'], ['a']]),
# Non-Chinese characters, which have no pinyin
('⺁', {'style': TONE2}, [['\u2e81']]),
('⺁', {'style': TONE2, 'errors': 'ignore'}, []),
('⺁', {'style': TONE2, 'errors': 'replace'}, [['2e81']]),
('⺁⺁', {'style': TONE2, 'errors': 'replace'}, [['2e812e81']]),
('⺁⺁', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
[['a'], ['a']]),
('⺁⺁', {'style': TONE2, 'errors': lambda x: [['a', 'b'], ['b', 'c']]},
[['a'], ['b']]),
('⺁⺁', {'style': TONE2, 'heteronym': True,
'errors': lambda x: [['a', 'b'], ['b', 'c']]},
[['a', 'b'], ['b', 'c']]),
# Chinese characters that have no pinyin
('鿅', {'style': TONE2}, [['\u9fc5']]),
('鿅', {'style': TONE2, 'errors': 'ignore'}, []),
('鿅', {'style': TONE2, 'errors': '233'}, []),
('鿅', {'style': TONE2, 'errors': 'replace'}, [['9fc5']]),
('鿅', {'style': TONE2, 'errors': lambda x: ['a']}, [['a']]),
('鿅', {'style': TONE2, 'errors': lambda x: None}, []),
('鿅鿅', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
[['a'], ['a']]),
def hanzi_to_pinyin(txt):
    """
    Romanize the Chinese characters in txt as pinyin.

    The text is handed to pyp.lazy_pinyin with the TONE2 style and the
    returned pieces are concatenated with no separator.

    Args:
        txt -- text containing Chinese characters (unicode)
    Returns:
        unicode string built by joining the pieces returned by lazy_pinyin
    """
    return u''.join(pyp.lazy_pinyin(txt, style=pyp.TONE2))
def get_pinyin_sim(word1, word2):
    """Return a pinyin-based similarity score for two words.

    Characters are compared position by position up to the length of the
    shorter word. A position counts as a match when the two characters share
    at least one reading (TONE2 style, heteronyms included). The score is
    ``2 * matches / (len(word1) + len(word2))``.

    :param word1: first word (sequence of characters)
    :param word2: second word (sequence of characters)
    :return: float score; 0.0 when nothing matches or when both words are
        empty (there is nothing to compare, and the formula would otherwise
        divide by zero)
    """
    total_len = len(word1) + len(word2)
    if total_len == 0:
        # Guard against 0 / 0 for two empty inputs.
        return 0.0

    count = 0
    for ch1, ch2 in zip(word1, word2):
        py1 = pinyin(ch1, style=pypinyin.TONE2, heteronym=True)[0]
        # Looked up once per position; it does not depend on the reading
        # of ch1 being examined.
        py2 = pinyin(ch2, style=pypinyin.TONE2, heteronym=True)[0]
        # At most one match is counted per position.
        if any(p1 in py2 for p1 in py1):
            count += 1
    return 2.0 * count / total_len
def get_homophones_by_pinyin(input_pinyin):
    """
    Collect the characters whose pinyin equals the given pinyin.

    :param input_pinyin: pinyin string in TONE2 form (e.g. 中 is zho1ng)
    :return: list of matching characters, in code-point order
    """
    # The CJK Unified Ideographs range 0x4E00-0x9FA5 holds the commonly
    # cited 20902 characters.
    candidates = (chr(code) for code in range(0x4e00, 0x9fa6))
    return [
        ch
        for ch in candidates
        # Only the first reading returned for the character is compared.
        if pinyin([ch], style=pypinyin.TONE2)[0][0] == input_pinyin
    ]
def ch2p(speech):
    """Convert text to a space-separated pinyin string.

    The text is split into TONE2-style syllables with ``lazy_pinyin``, passed
    through ``text2pinyin``, joined with single spaces, and then every
    key of ``alpha_pronuce`` found in the result is replaced by its value.

    :param speech: text to convert; must be a ``str`` (subclasses accepted)
    :return: the converted string, or ``None`` after printing
        "input format error" when ``speech`` is not a string
    """
    # isinstance rather than an exact type comparison, so str subclasses
    # are converted too instead of being rejected.
    if not isinstance(speech, str):
        print("input format error")
        return None

    syllables = lazy_pinyin(speech, style=pypinyin.TONE2)
    syllables = text2pinyin(syllables)
    text = ' '.join(syllables)
    for alpha, pronuce in alpha_pronuce.items():
        text = text.replace(alpha, pronuce)
    # NOTE(review): as written, both calls replace a single space with a
    # single space and change nothing — presumably they were meant to
    # collapse repeated spaces; confirm against the original source.
    text = text.replace(" "," ")
    text = text.replace(" ", " ")
    return text
def get_pinyin_sim(word1, word2):
    """Return a pinyin-based similarity score for two words.

    Characters are compared position by position up to the length of the
    shorter word. A position counts as a match when the two characters share
    at least one reading (TONE2 style, heteronyms included). The score is
    ``2 * matches / (len(word1) + len(word2))``.

    :param word1: first word (sequence of characters)
    :param word2: second word (sequence of characters)
    :return: float score; 0.0 when nothing matches or when both words are
        empty (there is nothing to compare, and the formula would otherwise
        divide by zero)
    """
    total_len = len(word1) + len(word2)
    if total_len == 0:
        # Guard against 0 / 0 for two empty inputs.
        return 0.0

    count = 0
    for ch1, ch2 in zip(word1, word2):
        py1 = pinyin(ch1, style=pypinyin.TONE2, heteronym=True)[0]
        # Looked up once per position; it does not depend on the reading
        # of ch1 being examined.
        py2 = pinyin(ch2, style=pypinyin.TONE2, heteronym=True)[0]
        # At most one match is counted per position.
        if any(p1 in py2 for p1 in py1):
            count += 1
    return 2.0 * count / total_len