Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
'er',
'im',
'haus',
'bleibt')),
(('that',), (',', 'dass')),
(('that',), ('dass',)),
(('that', 'he'), (',', 'dass', 'er')),
(('that', 'he'), ('dass', 'er')),
(('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
(',', 'dass', 'er', 'im', 'haus', 'bleibt')),
(('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
('dass', 'er', 'im', 'haus', 'bleibt')),
(('will', 'stay'), ('bleibt',)),
(('will', 'stay', 'in', 'the', 'house'),
('im', 'haus', 'bleibt'))])
self.assertEqual(phrase_extract(es, fs, alignment), ans)
# another test
es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
sentenses = [("僕 は 男 です", "I am a man"),
("私 は 女 です", "I am a girl"),
("私 は 先生 です", "I am a teacher"),
("彼女 は 先生 です", "She is a teacher"),
("彼 は 先生 です", "He is a teacher"),
]
corpus = mkcorpus(sentenses)
alignment = symmetrization(es, fs, corpus)
ans = set([(('\xe3\x81\xaf',
'\xe5\x85\x88\xe7\x94\x9f',
'\xe3\x81\xa7\xe3\x81\x99'),
('a', 'teacher')),
(('\xe5\x85\x88\xe7\x94\x9f',), ('teacher',)),
("彼 は 先生 です", "He is a teacher"),
]
corpus = mkcorpus(sentenses)
alignment = symmetrization(es, fs, corpus)
ans = set([(('\xe3\x81\xaf',
'\xe5\x85\x88\xe7\x94\x9f',
'\xe3\x81\xa7\xe3\x81\x99'),
('a', 'teacher')),
(('\xe5\x85\x88\xe7\x94\x9f',), ('teacher',)),
(('\xe7\xa7\x81',), ('I', 'am')),
(('\xe7\xa7\x81',
'\xe3\x81\xaf',
'\xe5\x85\x88\xe7\x94\x9f',
'\xe3\x81\xa7\xe3\x81\x99'),
('I', 'am', 'a', 'teacher'))])
self.assertEqual(phrase_extract(es, fs, alignment), ans)
def db_phrase_extract(lang1, lang2,
                      lang1method=lambda x: x,
                      lang2method=lambda x: x,
                      init_val=1.0e-10,
                      db="sqlite:///:memory:"):
    """Extract phrase pairs from one sentence pair using a DB-backed alignment.

    Each sentence is passed through its own preprocessing callable and then
    split on whitespace into tokens.  The token lists are aligned with
    ``_db_symmetrization`` and the result is handed to
    ``phrase_extract.phrase_extract``.

    Args:
        lang1: Sentence in the first language (a string).
        lang2: Sentence in the second language (a string).
        lang1method: Callable applied to ``lang1`` before splitting; must
            return an object with a ``split()`` method.  Defaults to identity.
        lang2method: Callable applied to ``lang2`` before splitting; must
            return an object with a ``split()`` method.  Defaults to identity.
        init_val: Forwarded unchanged to ``_db_symmetrization``.
        db: Forwarded unchanged to ``_db_symmetrization``.
            NOTE(review): looks like a database URL -- confirm against
            ``_db_symmetrization``.

    Returns:
        Whatever ``phrase_extract.phrase_extract`` returns for the two token
        lists and the computed alignment.
    """
    lang1s = lang1method(lang1).split()
    # Each language must go through its own preprocessor; the previous code
    # applied ``lang1method`` to ``lang2``, silently ignoring ``lang2method``.
    lang2s = lang2method(lang2).split()
    alignment = _db_symmetrization(lang1s, lang2s,
                                   init_val=init_val,
                                   db=db)
    return phrase_extract.phrase_extract(lang1s, lang2s, alignment)