Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_corpus_kolaw():
    """Smoke-test the ``kolaw`` corpus loader from ``konlpy.corpus``.

    Lists the corpus file ids, resolves the corpus path with and without
    a file id, checks the loader's ``name`` attribute, and verifies the
    first ten characters of ``constitution.txt``.
    """
    from konlpy.corpus import kolaw

    fids = kolaw.fileids()
    # Exercised only to confirm they do not raise; return values are
    # not checked.
    kolaw.abspath()
    kolaw.abspath(fids[0])
    assert kolaw.name == 'kolaw'

    # Use a context manager so the corpus file handle is closed even if
    # the read fails.
    with kolaw.open('constitution.txt') as corpus_file:
        head = corpus_file.read(10)
    assert head == u'\ub300\ud55c\ubbfc\uad6d\ud5cc\ubc95\n\n\uc720\uad6c'
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
from konlpy.tag import Kkma
from konlpy.corpus import kolaw
from konlpy.utils import pprint
from nltk import collocations
# Association measures used to score bigrams (PMI is used below).
measures = collocations.BigramAssocMeasures()

# Read the whole constitution text; the context manager makes sure the
# corpus file handle is closed once the text is in memory.
with kolaw.open('constitution.txt') as corpus_file:
    doc = corpus_file.read()

print('\nCollocations among tagged words:')
# Each element of tagged_words is unpacked below as a (word, tag) pair.
tagged_words = Kkma().pos(doc)
finder = collocations.BigramCollocationFinder.from_words(tagged_words)
pprint(finder.nbest(measures.pmi, 10))  # top 10 bigrams with highest PMI

print('\nCollocations among words:')
words = [w for w, t in tagged_words]
ignored_words = [u'안녕']
finder = collocations.BigramCollocationFinder.from_words(words)
# Filter on words shorter than two characters or explicitly ignored.
finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
finder.apply_freq_filter(3)  # only bigrams that appear 3+ times
pprint(finder.nbest(measures.pmi, 10))

print('\nCollocations among tags:')
tags = [t for w, t in tagged_words]