Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_symmetrization(self):
    # Builds a corpus from five sentence pairs with mkcorpus, runs
    # symmetrization() on one tokenized pair from that corpus, and checks
    # the result against a fixed set of (int, int) pairs.
    # NOTE(review): the pairs look like 1-based word positions in es/fs --
    # confirm against symmetrization().
    sentence_pairs = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentence_pairs)

    # Whitespace-tokenized sentence pair whose alignment is under test.
    es = "私 は 先生 です".split()
    fs = "I am a teacher".split()

    expected = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
    self.assertEqual(symmetrization(es, fs, corpus), expected)
(('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
('dass', 'er', 'im', 'haus', 'bleibt')),
(('will', 'stay'), ('bleibt',)),
(('will', 'stay', 'in', 'the', 'house'),
('im', 'haus', 'bleibt'))])
self.assertEqual(phrase_extract(es, fs, alignment), ans)
# another test
es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
sentenses = [("僕 は 男 です", "I am a man"),
("私 は 女 です", "I am a girl"),
("私 は 先生 です", "I am a teacher"),
("彼女 は 先生 です", "She is a teacher"),
("彼 は 先生 です", "He is a teacher"),
]
corpus = mkcorpus(sentenses)
alignment = symmetrization(es, fs, corpus)
ans = set([(('\xe3\x81\xaf',
'\xe5\x85\x88\xe7\x94\x9f',
'\xe3\x81\xa7\xe3\x81\x99'),
('a', 'teacher')),
(('\xe5\x85\x88\xe7\x94\x9f',), ('teacher',)),
(('\xe7\xa7\x81',), ('I', 'am')),
(('\xe7\xa7\x81',
'\xe3\x81\xaf',
'\xe5\x85\x88\xe7\x94\x9f',
'\xe3\x81\xa7\xe3\x81\x99'),
('I', 'am', 'a', 'teacher'))])
self.assertEqual(phrase_extract(es, fs, alignment), ans)
(8, 8), (9, 9), (5, 10), (6, 10)]
from smt.utils.utility import matrix
print(matrix(len(es), len(fs), e2f, es, fs))
print(matrix(len(es), len(fs), f2e, es, fs))
ali = _alignment(es, fs, e2f, f2e)
print(matrix(len(es), len(fs), ali, es, fs))
# test for symmetrization
from smt.utils.utility import mkcorpus
sentenses = [("僕 は 男 です", "I am a man"),
("私 は 女 です", "I am a girl"),
("私 は 先生 です", "I am a teacher"),
("彼女 は 先生 です", "She is a teacher"),
("彼 は 先生 です", "He is a teacher"),
]
corpus = mkcorpus(sentenses)
es = "私 は 先生 です".split()
fs = "I am a teacher".split()
syn = symmetrization(es, fs, corpus)
pprint(syn)
print(matrix(len(es), len(fs), syn, es, fs))
# (6, 10),
# (7, 8),
# (8, 8),
# (9, 9)])
#pprint(phrase_extract(es, fs, alignment))
# test2
from smt.utils.utility import mkcorpus
from word_alignment import symmetrization
sentenses = [("僕 は 男 です", "I am a man"),
("私 は 女 です", "I am a girl"),
("私 は 先生 です", "I am a teacher"),
("彼女 は 先生 です", "She is a teacher"),
("彼 は 先生 です", "He is a teacher"),
]
corpus = mkcorpus(sentenses)
es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
alignment = symmetrization(es, fs, corpus)
ext = phrase_extract(es, fs, alignment)
for e, f in ext:
print(' '.join(e), "<->", ' '.join(f))
## phrases
fs = "I am a teacher".split()
phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
print(phrases)
def test_available_phrases():
from smt.utils.utility import mkcorpus
from smt.phrase.word_alignment import symmetrization
sentenses = [("僕 は 男 です", "I am a man"),
("私 は 女 です", "I am a girl"),
("私 は 先生 です", "I am a teacher"),
("彼女 は 先生 です", "She is a teacher"),
("彼 は 先生 です", "He is a teacher"),
]
corpus = mkcorpus(sentenses)
es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
alignment = symmetrization(es, fs, corpus)
ext = phrase_extract(es, fs, alignment)
ans = ("は 先生 です <-> a teacher",
"先生 <-> teacher"
"私 <-> I am"
"私 は 先生 です <-> I am a teacher")
for e, f in ext:
print("{} {} {}".format(' '.join(e), "<->", ' '.join(f)))
## phrases
fs = "I am a teacher".split()
phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
print(phrases)
ans = {((1, 'I'), (2, 'am')),
((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
def train(sentences, loop_count=1000):
    """Build a corpus from *sentences* and hand it to ``_train``.

    Args:
        sentences: Input passed unchanged to ``utility.mkcorpus``.
            NOTE(review): presumably an iterable of sentence pairs --
            confirm against ``utility.mkcorpus``.
        loop_count: Forwarded unchanged to ``_train`` as its second
            argument; defaults to 1000.

    Returns:
        Whatever ``_train`` returns for the built corpus.
    """
    return _train(utility.mkcorpus(sentences), loop_count)