def test_sentence_tokenize_with_combined():
    corpus = SentenceTokenizer()
    expect = ["わんわん。", "「にゃ?」(にゃー)わんわん。", "「わおーん。」(犬より。)"]
    result = corpus.tokenize(DOCUMENT4)
    assert expect == result


def test_sentence_tokenize():
    corpus = SentenceTokenizer()
    expect = ["私は猫である。", "にゃお。", "にゃにゃ", "わんわん。", "にゃーにゃー。"]
    result = corpus.tokenize(DOCUMENT1)
    assert expect == result


def test_sentence_tokenize_with_bracket():
    corpus = SentenceTokenizer()
    expect = ["私は猫である(ただしかわいいものとする。異議は認める)。", "にゃお。", "にゃにゃ"]
    result = corpus.tokenize(DOCUMENT2)
    assert expect == result


def test_sentence_tokenize_with_quotation():
    corpus = SentenceTokenizer()
    expect = ["猫「にゃおにゃ。ただしかわいいものとする。異議は認める」。", "にゃお。", "にゃにゃ"]
    result = corpus.tokenize(DOCUMENT3)
    assert expect == result
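
The `DOCUMENT*` fixtures are defined elsewhere in the test module and are not shown in this snippet. Purely for illustration, a hypothetical value that would satisfy `test_sentence_tokenize`, assuming sentences end at `。` or a line break:

DOCUMENT1 = "私は猫である。にゃお。にゃにゃ\nわんわん。にゃーにゃー。"  # hypothetical fixture; the real one may differ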
def tokenize(self, document) -> List[str]:
    """
    Divide a raw document into sentences.

    :param document: a raw document
    :type document: str
    :return: list of sentences
    :rtype: list[str]
    """
    # Mask periods inside bracketed/quoted spans so they do not split sentences.
    for pattern in SentenceTokenizer.PATTERNS:
        pattern = re.compile(pattern)  # type: ignore
        document = re.sub(pattern, self.conv_period, document)

    result = []
    for line in document.split("\n"):
        line = line.rstrip()
        line = line.replace("\n", "")
        line = line.replace("\r", "")
        # Break after every sentence-final period, then split on the breaks.
        line = line.replace("。", "。\n")
        sentences = line.split("\n")

        for sentence in sentences:
            if not sentence:
                continue

            period_special = SentenceTokenizer.PERIOD_SPECIAL
            # The original snippet is truncated here. Presumably the placeholder
            # written by conv_period is restored to a real period before the
            # sentence is collected (assuming a PERIOD constant paired with
            # PERIOD_SPECIAL):
            sentence = sentence.replace(period_special, SentenceTokenizer.PERIOD)
            result.append(sentence)

    return result
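
The method above relies on class attributes (`PATTERNS`, `PERIOD_SPECIAL`) and a `conv_period` callback that are not part of this snippet. As a sanity check of the masking technique, here is a minimal standalone sketch with illustrative constants; the real class may define them differently:

import re

PERIOD = "。"
PERIOD_SPECIAL = "__PERIOD__"  # placeholder that survives sentence splitting
PATTERNS = [r"\(.*?\)", r"「.*?」"]  # bracketed/quoted spans to protect


def conv_period(match: re.Match) -> str:
    # Replace periods inside a protected span with the placeholder.
    return match.group(0).replace(PERIOD, PERIOD_SPECIAL)


def split_sentences(document: str) -> list:
    # Same two-phase approach as tokenize above: mask, split, unmask.
    for pattern in PATTERNS:
        document = re.sub(pattern, conv_period, document)
    result = []
    for line in document.split("\n"):
        line = line.rstrip().replace("。", "。\n")
        for sentence in line.split("\n"):
            if sentence:
                result.append(sentence.replace(PERIOD_SPECIAL, PERIOD))
    return result


print(split_sentences("わんわん。「わおーん。」(犬より。)"))
# -> ['わんわん。', '「わおーん。」(犬より。)']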