How to use the natural.TreebankWordTokenizer function in natural

To help you get started, we’ve selected a few natural examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Hugo-ter-Doest / chart-parsers / example / example_with_wordnet.js View on Github external
// Dependencies: filesystem access, the `natural` NLP toolkit, a simple
// POS tagger, and the local chart-parser package.
var fs = require('fs');
var natural = require('natural');
var Tagger = require('simple-pos-tagger');

var ChartParsers = require('../index');
var parserFactory = new ChartParsers.ParserFactory();
var GrammarParser = ChartParsers.GrammarParser;

// Input files: sentences to parse and the WordNet-tagged English grammar.
var path = './data/';
var sentences_file = path + 'sentences.txt';
var grammar_file = path + 'English grammar using Wordnet tags.txt';

var tagger_config_file = '../node_modules/simple-pos-tagger/data/English/lexicon_files.json';

// FIX: `var` added — the original assigned `tokenizer` without declaring it,
// creating an implicit global (a ReferenceError in strict mode).
var tokenizer = new natural.TreebankWordTokenizer();
var wordnet = new natural.WordNet();
var sentences; // populated asynchronously by initialise()

function initialise(callback) {
  // read sentences from file
  fs.readFile(sentences_file, 'utf8', function (error, sentences_text) {
    if (error) {
      logger.error(error);
    }
    sentences = sentences_text.split('\n');
    // read grammar from file
    fs.readFile(grammar_file, 'utf8', function (error, grammar_text) {
      if (error) {
        logger.error(error);
      }
      // parse the grammar
github velniukas / nlp2json / server.js View on Github external
// Module dependencies and NLP helpers, one declaration per statement.
var natural = require('natural');
var pos = require('pos');
var wordnet = new natural.WordNet();
var _ = require('underscore');
var NGrams = natural.NGrams;
var classifier = new natural.BayesClassifier();
var tokenizer = new natural.TreebankWordTokenizer(); // natural.WordTokenizer();

// Monkey-patches String.prototype with stem()/tokenizeAndStem().
natural.PorterStemmer.attach();

// load the classifier data and learn the schema
/*natural.BayesClassifier.load('./classifier.json', null, function(err, classifier) {
	// if the classifier hasn't been saved, then calculate it now
	if (err) {
		var traindata = require('./trainingdata.json');
		for (i in traindata)
		{
			classifier.addDocument(traindata[i].query, traindata[i].category);
		}
		classifier.train();
		classifier.save('classifier.json', function(err, classifier) {

		});
github SuperMarcus / pgoogle / index.js View on Github external
async function obfuscate(previous) {
    const { notice, status, stageHandler, useObfuscation } = previous.params;

    if (!useObfuscation) {
        notice("Skipping synonyms replacing process.");
        return previous;
    }

    await notice("Obfuscating sentences...");
    await status("Preparing obfuscator...");

    let dummy = d => d;
    let wnet = new WNet({
        dataDir: wnetdb.path
    });
    let tzr = (new natural.TreebankWordTokenizer()).tokenize;
    let isWord = /\w+/;

    let ntrBf = path.join(path.dirname(require.resolve("natural")), "brill_pos_tagger");
    let defaultCat = '?';

    let lex = new natural.Lexicon(ntrBf + "/data/English/lexicon_from_posjs.json", defaultCat);
    let rules = new natural.RuleSet(ntrBf + "/data/English/tr_from_posjs.txt");
    let tagger = new natural.BrillPOSTagger(lex, rules);
    let np = new natural.NounInflector();
    let pvp = new natural.PresentVerbInflector();
    let stmr = natural.PorterStemmer;

    let _accumunator = 0;
    let _total = 0;
    let _st = "";
    let _swp = (w) => ( isWord.test(w[0]) );
github Houshalter / amabot / amabot.js View on Github external
test();
	});
}

// Shallow element-wise comparison of two arrays using strict equality.
// Returns false on any length or element mismatch, true otherwise.
function arraysEqual(arr1, arr2) {
    if (arr1.length !== arr2.length) {
        return false;
    }
    for (var idx = 0; idx < arr1.length; idx++) {
        if (arr1[idx] !== arr2[idx]) {
            return false;
        }
    }
    return true;
}

// Shared NLP helpers: Penn Treebank word tokenizer and n-gram generator.
var tokenizer = new natural.TreebankWordTokenizer();
var NGrams = natural.NGrams;

// Lower-cases the text, then splits it into Treebank-style word tokens.
function tokenize(text) {
	var lowered = text.toLowerCase();
	return tokenizer.tokenize(lowered);
}

function matchMarkov(search, text, n){
	var search = tokenize(search);
	var text = tokenize(text);
	var searchgrams = NGrams.ngrams(search, n);
	var textgrams = NGrams.ngrams(text, n);
	var count = 0;
for (textgramNum in textgrams){
		for (searchgramNum in searchgrams){
			if (arraysEqual(searchgrams[searchgramNum], textgrams[textgramNum])){
				count += 1;
			}
github GaloisInc / FiveUI / exampleData / ruleSets / language-processing / natural / upGoerFive.js View on Github external
'especially', 'fig', 'afraid', 'huge', 'sister', 'steel', 'discuss',
'forward', 'similar', 'guide', 'experience', 'score', 'apple',
'bought', 'led', 'pitch', 'coat', 'mass', 'card', 'band', 'rope',
'slip', 'win', 'dream', 'evening', 'condition', 'feed', 'tool',
']total', 'basic', 'smell', 'valley', 'nor', 'double', 'seat',
'arrive', 'master', 'track', 'parent', 'shore', 'division', 'sheet',
'substance', 'favor', 'connect', 'post', 'spend', 'chord', 'fat',
'glad', 'original', 'share', 'station', 'dad', 'bread', 'charge',
'proper', 'bar', 'offer', 'segment', 'slave', 'duck', 'instant',
'market', 'degree', 'populate', 'chick', 'dear', 'enemy', 'reply',
'drink', 'occur', 'support', 'speech', 'nature', 'range', 'steam',
'motion', 'path', 'liquid', 'log', 'meant', 'quotient', 'teeth',
'shell', 'neck' ];

// Porter stemmer reduces inflected words to a common stem; the Treebank
// tokenizer splits page text into words before stemming.
var stemmer = natural.PorterStemmer;
var tokenizer = new natural.TreebankWordTokenizer();

var getTextNodesIn = function (node, includeWhitespaceNodes) {
  var textNodes = [], whitespace = /^\s*$/;

  function getTextNodes(node) {
    if ($(node).attr('id') == 'fiveui-top') {
      return;
    }
    if (node.nodeType == 3) {
      if (includeWhitespaceNodes || !whitespace.test(node.nodeValue)) {
        textNodes.push(node);
      }
    } else {
      for (var i = 0, len = node.childNodes.length; i < len; ++i) {
        getTextNodes(node.childNodes[i]);
      }
github blprnt / ArtArchive / Class4 / marc_template / index.js View on Github external
//Checks a record string (_r) against a list of candidate words (_w).
//Returns {chk:true, w:<candidate>} when any stemmed word of the record
//matches a candidate case-insensitively; when several candidates match,
//the LAST one in _w wins (same as the original nested-loop behavior).
function checkForWords(_r, _w) {

	//Tokenize the record (break it into words)
	//FIX: `var` added — assigning without a declaration created an implicit global.
	var tokenizer = new natural.TreebankWordTokenizer();
	//FIX: dropped the no-op `[...][0]` wrapper around the token array.
	var words = tokenizer.tokenize(_r);

	//Stem each word once up front so that dogs becomes dog, etc.
	//(The original re-stemmed every word for every candidate.)
	var stemmed = [];
	for (var j = 0; j < words.length; j++) {
		stemmed.push(natural.PorterStemmer.stem(words[j]).toLowerCase());
	}

	var chk = {chk:false, w:null};

	for (var i = 0; i < _w.length; i++) {
		var candidate = _w[i].toLowerCase();
		for (var k = 0; k < stemmed.length; k++) {
			if (stemmed[k] == candidate) {
				chk.chk = true;
				chk.w = _w[i];
			}
		}
	}
	return(chk);
}
github linanqiu / lexrank / lexrank.js View on Github external
function summarize(text, lines, callback) {
    var sentenceTokenizer = new Tokenizer('utterer');
    sentenceTokenizer.setEntry(text);
    var sentences = sentenceTokenizer.getSentences();
    var sentencesOriginal = sentences.slice();

    var wordTokenizer = new natural.TreebankWordTokenizer();
    sentences.forEach(function (sentence, index, array) {
      array[index] = wordTokenizer.tokenize(sentence.toLowerCase());
    });

    var matrix = constructMatrix(sentences);
    var sortedSentences = pageRank(matrix, sentencesOriginal);

    var topLines = [];

    for (var i = 0; i < Math.min(lines, sortedSentences.length); i++) {
      topLines.push(sortedSentences[i]);
    }

    topLines.sort(function (a, b) {
      return a.index - b.index;
    });
github blprnt / ArtArchive / Class6 / MARC_Network / index.js View on Github external
});

	//And what happens when it finishes parsing all of the records. 
	parser.on('end', function() {
	    onParseFinished();
	}); 

}




//------------------CHECK FOR MATCHES FUNCTION ---------------------------------------------------------------------------!!
//This function checks any string (input) against any list of candidate strings (candidates)
//Uses NLP to split the sentence into words and also to stem
var tokenizer = new natural.TreebankWordTokenizer();
//Used to singularize the words so that frogs matches frog. Whether or not you have to do this will depend on what data you're trying to match.
//For example if it's something *already* standardized (i.e. Subjects) you won't have to.
//This function is SLOW if there are a lot of words to check against
var nounInflector = new natural.NounInflector();

function checkForMatches(input, candidates) {

	//Tokenize the record (break it into words)
	var words = [tokenizer.tokenize(input)][0];

	//Set up our return object, this is the state that is returned with no matches
	var chk = {chk:false, words:[]};
	
	for (var i = 0; i < candidates.length; i++) {
		var cand = nounInflector.singularize(candidates[i].toLowerCase());
		for (var j = 0; j < words.length; j++) {

natural

General natural language (tokenizing, stemming (English, Russian, Spanish), part-of-speech tagging, sentiment analysis, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance, Dice's Coefficient) facilities for node.

MIT
Latest version published 1 month ago

Package Health Score

98 / 100
Full package analysis