Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_pinyin_initials():
    """Check each output style for a word whose syllables have an initial and a final."""
    hans = '中心'
    toned = [['zh\u014dng'], ['x\u012bn']]

    # Default style: tone marks included.
    assert pinyin(hans) == toned
    assert pinyin(hans, strict=False) == toned

    # Every case below is expected to give the same answer with the default
    # strictness and with strict=False.
    expectations = [
        # Plain style, no tone information.
        (NORMAL, [['zhong'], ['xin']]),
        # Tone style, tone mark written over the vowel.
        (TONE, toned),
        # Tone style 2, tone digit placed right after the toned vowel.
        (TONE2, [['zho1ng'], ['xi1n']]),
        # Tone style 3, tone digit placed at the end of each syllable.
        (TONE3, [['zhong1'], ['xin1']]),
        # Initials style, only the initial consonant of each syllable.
        (INITIALS, [['zh'], ['x']]),
        # First-letter style, only the first letter of each syllable.
        (FIRST_LETTER, [['z'], ['x']]),
    ]
    for style, expected in expectations:
        assert pinyin(hans, style) == expected
        assert pinyin(hans, style, strict=False) == expected
['居', dict(style=FINALS), ['v']],
['居', dict(style=FINALS, strict=False), ['u']],
['区', dict(style=NORMAL), ['qu']],
['区', dict(style=FINALS), ['v']],
['区', dict(style=FINALS, strict=False), ['u']],
['虚', dict(style=NORMAL), ['xu']],
['虚', dict(style=FINALS), ['v']],
['虚', dict(style=FINALS, strict=False), ['u']],
['女', dict(style=NORMAL), ['nv']],
['女', dict(style=FINALS), ['v']],
['女', dict(style=FINALS, strict=False), ['v']],
['吕', dict(style=NORMAL), ['lv']],
['吕', dict(style=FINALS), ['v']],
['吕', dict(style=FINALS, strict=False), ['v']],
['具', dict(style=NORMAL), ['ju']],
['具', dict(style=NORMAL, strict=False), ['ju']],
['具', dict(style=TONE), ['jù']],
['具', dict(style=TONE, strict=False), ['jù']],
['具', dict(style=TONE2), ['ju4']],
['具', dict(style=TONE2, strict=False), ['ju4']],
['具', dict(style=TONE3), ['ju4']],
['具', dict(style=TONE3, strict=False), ['ju4']],
['具', dict(style=INITIALS), ['j']],
['具', dict(style=INITIALS, strict=False), ['j']],
['具', dict(style=FIRST_LETTER), ['j']],
['具', dict(style=FIRST_LETTER, strict=False), ['j']],
['具', dict(style=FINALS), ['v']],
['具', dict(style=FINALS, strict=False), ['u']],
['具', dict(style=FINALS_TONE), ['ǜ']],
['具', dict(style=FINALS_TONE, strict=False), ['ù']],
['具', dict(style=FINALS_TONE2), ['v4']],
['儿', dict(style=FINALS), ['er']],
]
@pytest.mark.parametrize('hans, kwargs, result', data_for_finals)
def test_finals(hans, kwargs, result):
    """Check one ``data_for_finals`` case against both conversion entry points.

    ``lazy_pinyin`` must return ``result`` as-is, while ``pinyin`` must return
    the same list wrapped in one extra list.
    """
    flat = lazy_pinyin(hans, **kwargs)
    assert flat == result

    nested = pinyin(hans, **kwargs)
    assert nested == [result]
# 零声母
data_for_zero_consonant = [
# i行的韵母,前面没有声母的时候,写成yi(衣),ya(呀),ye(耶),yao(腰),
# you(忧),yan(烟),yin(因),yang(央),ying(英),yong(雍)。
['衣', dict(style=NORMAL), ['yi']],
['衣', dict(style=FINALS), ['i']],
['衣', dict(style=FINALS, strict=False), ['i']],
['呀', dict(style=NORMAL), ['ya']],
['呀', dict(style=FINALS), ['ia']],
['呀', dict(style=FINALS, strict=False), ['a']],
['耶', dict(style=NORMAL), ['ye']],
['耶', dict(style=FINALS), ['ie']],
['耶', dict(style=FINALS, strict=False), ['e']],
['腰', dict(style=NORMAL), ['yao']],
['腰', dict(style=FINALS), ['iao']],
['腰', dict(style=FINALS, strict=False), ['ao']],
['忧', dict(style=NORMAL), ['you']],
['忧', dict(style=FINALS), ['iou']],
['忧', dict(style=FINALS, strict=False), ['ou']],
['烟', dict(style=NORMAL), ['yan']],
['烟', dict(style=FINALS), ['ian']],
def init_emission():
    """
    Initialise the emission probabilities.

    For every ``(phrase, frequency)`` pair yielded by ``iter_dict()``, the
    phrase is converted with ``pinyin(..., style=NORMAL)`` and each character
    is paired with its list of readings. The phrase frequency is split evenly
    across that list and accumulated per character and reading. Finally each
    accumulated weight is normalised by the character's total and its
    logarithm is stored through ``Emission.add(character, reading, value)``.
    """
    character_pinyin_map = {}
    for phrase, frequency in iter_dict():
        pinyins = pinyin(phrase, style=NORMAL)
        for character, py in zip(phrase, pinyins):
            character_pinyin_count = len(py)
            # NOTE(review): the weights below rely on true division; on
            # Python 2 that needs ``from __future__ import division`` -- confirm.
            if character not in character_pinyin_map:
                # First sighting of this character: seed its reading table.
                character_pinyin_map[character] = {
                    x: frequency / character_pinyin_count for x in py
                }
            else:
                pinyin_freq_map = character_pinyin_map[character]
                for x in py:
                    pinyin_freq_map[x] = (
                        pinyin_freq_map.get(x, 0)
                        + frequency / character_pinyin_count
                    )

    # ``dict.items()`` exists on both Python 2 and Python 3, whereas
    # ``dict.iteritems()`` raises AttributeError on Python 3.
    for character, pinyin_map in character_pinyin_map.items():
        sum_frequency = sum(pinyin_map.values())
        for py, frequency in pinyin_map.items():
            Emission.add(character, py, log(frequency / sum_frequency))
def get_homophones_by_char(input_char):
    """
    Collect the Chinese characters that share a reading with ``input_char``.

    Readings are compared in ``pypinyin.NORMAL`` style, and only the first
    reading of the first converted entry is used on each side.

    :param input_char: text whose first reading is the comparison target.
    :return: list of one-character strings, in code-point order, taken from
        the range U+4E00..U+9FA5 whose first reading equals the target.
    """
    # The target reading does not depend on the candidate, so compute it once
    # instead of once per scanned code point.
    target = pinyin(input_char, style=pypinyin.NORMAL)[0][0]

    # The CJK Unified Ideographs block scanned here is 0x4E00-0x9FA5, i.e. the
    # commonly cited 20902 characters (the range end is exclusive).
    candidates = (chr(code_point) for code_point in range(0x4e00, 0x9fa6))
    return [
        candidate
        for candidate in candidates
        if pinyin([candidate], style=pypinyin.NORMAL)[0][0] == target
    ]
def check_pinyin_same(input, match_word):
    """Compare one reading of ``input`` with one reading of ``match_word``.

    Both arguments are converted with ``pypinyin.pinyin`` in ``NORMAL`` style.
    The first reading of entry 0 of ``input`` is compared with the first
    reading of entry 3 of ``match_word``.

    :return: ``True`` when the two readings are equal, ``False`` otherwise.
    """
    # NOTE(review): index 3 (presumably the fourth character of match_word)
    # is asymmetric with index 0 used for ``input`` -- confirm it is intended.
    return (
        pypinyin.pinyin(input, style=pypinyin.NORMAL)[0][0]
        == pypinyin.pinyin(match_word, style=pypinyin.NORMAL)[3][0]
    )
def check_pinyin_same(input, match_word):
    """Compare one reading of ``input`` with one reading of ``match_word``.

    Both arguments are converted with ``pypinyin.pinyin`` in ``NORMAL`` style.
    The first reading of entry 0 of ``input`` is compared with the first
    reading of entry 3 of ``match_word``.

    :return: ``True`` when the two readings are equal, ``False`` otherwise.
    """
    # NOTE(review): index 3 (presumably the fourth character of match_word)
    # is asymmetric with index 0 used for ``input`` -- confirm it is intended.
    return (
        pypinyin.pinyin(input, style=pypinyin.NORMAL)[0][0]
        == pypinyin.pinyin(match_word, style=pypinyin.NORMAL)[3][0]
    )
def get_homophones_by_char(input_char):
    """
    Collect the Chinese characters that share a reading with ``input_char``.

    Readings are compared in ``pypinyin.NORMAL`` style, and only the first
    reading of the first converted entry is used on each side.

    :param input_char: text whose first reading is the comparison target.
    :return: list of one-character strings, in code-point order, taken from
        the range U+4E00..U+9FA5 whose first reading equals the target.
    """
    # The target reading does not depend on the candidate, so compute it once
    # instead of once per scanned code point.
    target = pinyin(input_char, style=pypinyin.NORMAL)[0][0]

    # The CJK Unified Ideographs block scanned here is 0x4E00-0x9FA5, i.e. the
    # commonly cited 20902 characters (the range end is exclusive).
    candidates = (chr(code_point) for code_point in range(0x4e00, 0x9fa6))
    return [
        candidate
        for candidate in candidates
        if pinyin([candidate], style=pypinyin.NORMAL)[0][0] == target
    ]
l = list(map(lambda x: pinyin(x, heteronym=True,strict=True,style=pypinyin.NORMAL), chars))
# flatten list