How to use the ngram.SentenceNgramSampler class in ngram

To help you get started, we’ve selected a few ngram examples, based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github TartuNLP / ngram2vec / extract_ngrams.py View on Github external
import sys

import ngram

if __name__ == "__main__":
	# Command line: <input corpus path> <output path>.
	inputPath, outputPath = sys.argv[1], sys.argv[2]

	# Sampler settings: the minCounts thresholds and the ngramThresholdBeta value.
	samplerOptions = {"minCounts": [20, 120, 90], "ngramThresholdBeta": 0.125}
	sampler = ngram.SentenceNgramSampler(inputPath, **samplerOptions)

	# Each item yielded by the sampler is written as one space-separated line.
	with open(outputPath, "w") as out:
		out.writelines(" ".join(tokens) + "\n" for tokens in sampler)
	print("Job finished")
github TartuNLP / ngram2vec / learnmdl.py View on Github external
# Settings for the factored Estonian data:
	# Values forwarded to the sampler as tokFactor / posFactor
	# (presumably the positions of the token and the POS tag inside each
	# factored word -- TODO confirm against ngram.py).
	tokFactor = 1
	posFactor = 2
	# Comma-separated tag lists forwarded as firstPosFilter / lastPosFilter.
	firstPosFilter = "A,S,H,V,X,D,G,U,Y"
	lastPosFilter = "S,H,V,X,K,Y"
	
	# freqFilter is passed to the sampler as minCounts; its first entry is
	# also reused below as Word2Vec's min_count.
	freqFilter = [5, 50]
	# None is forwarded as atLeastOnePosFilter.
	somePosFilter = None
	crazyBigMFCorpus = True
	beta = 0.125
	# Number of outer training passes run by the loop at the bottom.
	epochs = 10
	
	logging.basicConfig(level = logging.INFO)

	# dataFile is defined outside this fragment.
	lines = ngram.SentenceNgramSampler(dataFile, minCounts = freqFilter, tokFactor = tokFactor, posFactor = posFactor, firstPosFilter = firstPosFilter, lastPosFilter = lastPosFilter, atLeastOnePosFilter = somePosFilter, ngramThresholdBeta = beta, crazyBigMFCorpus = crazyBigMFCorpus)
	
	# When more than one threshold is configured, exhaust the sampler once
	# before training. Presumably this pass lets the sampler gather its
	# n-gram statistics -- TODO confirm against ngram.py.
	if len(freqFilter) > 1:
		debug("Initializing")
		for line in lines:
			pass

	if epochs > 0:
		# NOTE(review): iter=10 is set here while the loop below calls
		# train() with epochs=1 once per outer epoch -- confirm which of the
		# two is meant to control the amount of training. If Word2Vec is
		# gensim's, `iter` was renamed to `epochs` in gensim 4.0 -- confirm
		# the targeted version.
		model = Word2Vec(workers=60, sg=1, hs=1, iter=10, min_count=freqFilter[0])
		
		debug("Building vocab")
		model.build_vocab(lines)
		
		debug("Learning")
		for i in range(epochs):
			# len(lines) requires the sampler to implement __len__ (not visible here).
			model.train(lines, total_examples = len(lines), epochs = 1)
			# Save a checkpoint after every pass; modelFile is defined
			# outside this fragment.
			model.save(modelFile + ".trainable." + str(i))
github TartuNLP / ngram2vec / ngram.py View on Github external
def _applyJoinOps(self, sentence, toJoin):
		"""Collapse groups of tokens in a sentence into single joined tokens.

		sentence -- sequence of 2-item entries; the first item is the token text
		toJoin -- iterable of index collections, each naming the positions in
			`sentence` that are merged into one "__"-separated token

		Returns a new list of token strings. Each group replaces the whole
		span from its smallest to its largest index, so positions inside the
		span that are not part of the group are dropped.
		"""
		tokens = []
		for text, _ in sentence:
			tokens.append(text)

		# Apply the right-most groups first so that the indices of the groups
		# still to be processed remain valid after each replacement.
		for indices in sorted(toJoin, key=min, reverse=True):
			first, last = min(indices), max(indices)
			merged = "__".join(sentence[pos][0] for pos in sorted(indices))
			tokens = tokens[:first] + [merged] + tokens[last + 1:]

		return tokens
	
	def __iter__(self):
		"""Return this object itself as its iterator.

		NOTE(review): because the same object is returned every time, looping
		over an instance more than once depends on how __next__ (not visible
		here) behaves after exhaustion -- confirm it resets its state.
		"""
		return self

if __name__ == "__main__":
	# Manual check: print every item the sampler yields for the file named
	# on the command line, then iterate the same sampler a second time.
	logging.basicConfig(level=logging.INFO)

	sampler = SentenceNgramSampler(sys.argv[1], minCounts=[2, 2, 2])

	for passNo in range(2):
		if passNo == 1:
			print("Second iteration")
		for snt in sampler:
			print(snt)