How to use the natural.RegexpTokenizer function in natural

To help you get started, we’ve selected a few examples of natural in use, based on popular ways it is used in public projects.
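Before diving into the project excerpts below, here is a minimal sketch of the constructor in isolation. The pattern describes the delimiter characters to split on, and the sample sentence is purely illustrative:

var natural = require('natural');

// Split on any run of characters that is not a letter, digit, '-' or '?'
// (the same style of pattern used in several of the excerpts below).
var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9\-\?]+/});

console.log(tokenizer.tokenize('How about we meet at 10-30?'));
// → roughly [ 'How', 'about', 'we', 'meet', 'at', '10-30?' ]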


github erelsgl / nlu-server / classifiers.js View on Github external
var lemmatizer = new Lemmatizer();

var antonyms = {}
//var data = fs.readFileSync("./antonyms.txt", 'utf8').split("\n")

/*_.each(data, function(value, key, list){
        var value1 = value.split(",")
        antonyms[value1[0]] = value1[1]
        antonyms[value1[1]] = value1[0]
}, this)
*/


var old_unused_tokenizer = {tokenize: function(sentence) { return sentence.split(/[ \t,;:.!?]/).filter(function(a){return !!a}); }}

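// Tokenize on runs of characters other than letters, digits, '-' and '?'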
var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9\-\?]+/});

console.vlog = function(data) { fs.appendFileSync(log_file, data + '\n', 'utf8') };

// var tokenizer = new natural.WordTokenizer({'pattern':(/(\W+|\%)/)}); // WordTokenizer, TreebankWordTokenizer, WordPunctTokenizer
// var ngrams = new natural.NGrams.ngrams()

// var enhance = function (classifierType, featureExtractor, inputSplitter, featureLookupTable, labelLookupTable, preProcessor, postProcessor, TestSplitLabel, multiplyFeaturesByIDF, featureExpansion, featureExpansionScale, featureExpansionPhrase, featureFine, expansionParam) {
var enhance = function (classifierType, featureExtractor, inputSplitter, featureLookupTable, labelLookupTable, preProcessor, postProcessor, TestSplitLabel, multiplyFeaturesByIDF, featureOptions) {
// var enhance = function (classifierType, featureLookupTable, labelLookupTable) {
	return classifiers.EnhancedClassifier.bind(0, {
		normalizer: normalizer,

		inputSplitter: inputSplitter,
		featureOptions:featureOptions,

		// featureExpansion: featureExpansion,
github erelsgl / nlu-server / classifiers.js View on Github external
if ("input" in sample)
		sample = sample.input

/*	if (!('basic-dependencies' in sample['sentences']))
		throw new Error("train:"+train+" basic-dependencies not in the sample "+JSON.stringify(sample))
*/	
/*	if (!('sentences' in sample))
	   throw new Error("for some reason sentences not in sample "+JSON.stringify(sample))
*/
/*	if (!('tokens' in sample['sentences']))
	   throw new Error("for some reason tokens not in sample"+JSON.stringify(sample, null, 4))
*/
	if (_.isArray(sample['sentences']))
	   throw new Error("feAsync is only for object sentences")

	var tokenizer = new natural.RegexpTokenizer({pattern: /[^\%a-zA-Z0-9\-\?]+/});
	var text = regexpNormalizer(sample["text"].toLowerCase())
	
	console.vlog("feAsyncStanford: text: "+text)
	// the array of tokens
	// var tokenized = tokenizer.tokenize(text)
	// console.vlog("feAsyncStanford: tokenized: "+JSON.stringify(tokenized, null, 4))

	// sample['sentences'] = {"tokens":[]}

	// _.each(tokenized, function(value, key, list){
 //    	sample['sentences']['tokens'].push({
 //            "word": value,
 //            // "lemma": value[0]
 //            "lemma": natural.PorterStemmer.stem(value)
 //        	// "lemma": lemmerEng.lemmatize(value[0])
 //        })
github erelsgl / nlu-server / classifiers.js View on Github external
function getRule(text)
{
/*	if (!('tokens' in sen))
		{
		console.vlog("DEBUGRULE: for some reason tokens is not in the sentence " + JSON.stringify(sen, null, 4))
		throw new Error("DEBUGRULE: for some reason tokens is not in the sentence " + JSON.stringify(sen, null, 4))
		}
*/
	// var sentence = JSON.parse(JSON.stringify(sen))
 
	console.vlog("getRule: sentence: "+text)

	// change tokens 
  	var tokenizer = new natural.RegexpTokenizer({pattern: /[^\%a-zA-Z0-9\-\?]+/});
	
	text = regexpNormalizer(text.toLowerCase())
	var tkns = natural.NGrams.ngrams(tokenizer.tokenize(text), 1)
	
	var sentence = {}
	sentence['tokens'] = []

	_.each(tkns, function(value, key, list){
		sentence['tokens'].push({
			"word": value[0],
//			"lemma": value[0]
			"lemma": (lemmatizer.only_lemmas(value[0]).length > 0 ? lemmatizer.only_lemmas(value[0])[0]: value[0])
			})
	}, this)

	console.vlog("getRule: enrich lemma: "+JSON.stringify(sentence['tokens'], null, 4))
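	// Note: the value[0] lookups above work because NGrams.ngrams(tokens, 1)
	// returns an array of single-element arrays rather than a flat token list, e.g.
	// natural.NGrams.ngrams(['what', 'time', 'is', 'it'], 1)
	//   → [ ['what'], ['time'], ['is'], ['it'] ]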
github dadi / api / dadi / lib / search / analysers / standard.js View on Github external
'use strict'

const natural = require('natural')
const TfIdf = natural.TfIdf
const tokenizer = new natural.RegexpTokenizer({
  // pattern: new RegExp(/[^A-Za-zÅåÀÈÌÒÙàèìòùÁÉÍÓÚÝáéíóúýÂÊÎÔÛâêîôûÃÑÕãñõÄËÏÖÜŸäëïöüÿŠŽšžÇç]/i)
  pattern: new RegExp(/[^a-zA-Z\u00C0-\u017F]/i)
})

class StandardAnalyzer {
  constructor (fieldRules) {
    this.fieldRules = fieldRules
    this.tfidf = new TfIdf()
    this.tfidf.setTokenizer(tokenizer)
  }

  add (field, value) {
    if (Array.isArray(value)) {
      let filteredValues = value.filter(this.isValid)
      filteredValues.forEach(val => this.tfidf.addDocument(val, field))
    } else if (this.isValid(value)) {
github unfoldingWord / translationCore / modules / translationNotes_Check_plugin / subcomponents / TargetVerseDisplay.js View on Github external
///TargetVerseDisplay.js//

const api = window.ModuleApi;
const React = api.React;
const ReactBootstrap = api.ReactBootstrap;

var natural = require('natural');
var XRegExp = require('xregexp');
var nonUnicodeLetter = XRegExp('\\PL');

// Word tokenizer: splits on any character that is not a Unicode letter (XRegExp '\\PL')
const tokenizer = new natural.RegexpTokenizer({pattern: nonUnicodeLetter});

/* Contains a word from the target language, defines a lot of listeners for clicks */
const TargetWord = React.createClass({
  // highlighted: false,
  getInitialState: function() {
    return {
      highlighted: false,
      wordObj: { // this is required to pass into our callbacks
        'word': this.props.word,
        'key': this.props.keyId
      },

    };
  },

  userClick: function() {
github erelsgl / nlu-server / classifiers.js View on Github external
'REJECT':['no', 'not'],
		'QUERY': ['how', 'about', 'let', 'discuss']
	}

	var wh = ["what", "which", "how"]

	var sample = JSON.parse(JSON.stringify(sample_or)) 

	if ('input' in sample)
		sample = sample.input

	console.vlog("DEBUGSALIENT: text : "+ sample.text)	

	//var attrval = getRule(sample.sentences, sample.text).labels
	var attrval = getRule(sample.text).labels
	var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9\-\?]+/});

	text = regexpNormalizer(sample.text.toLowerCase())

	console.vlog("DEBUGSALIENT: normalized: "+text)

	var tkns = _.flatten(natural.NGrams.ngrams(tokenizer.tokenize(text), 1))
	console.vlog("DEBUGSALIENT: tokens: "+tkns)
	  
	 var features_add = {}
	_.each(salient, function(value, key, list){
		var inter = _.intersection(value, tkns).length
		if (inter != 0)
			{
			console.vlog("DEBUGSALIENT: GOT IT " +key+" "+inter)
			features[key] = inter
			}
github erelsgl / nlu-server / research / ppdb / evalmeasure_5ed_grams.js View on Github external
var natural = require('natural');
var utils = require('./utils');
var async = require('async');
var bars = require('../../utils/bars.js');
var partitions = require('limdu/utils/partitions');
var PrecisionRecall = require("limdu/utils/PrecisionRecall");
var truth = require("../rule-based/truth_utils.js")
var truth_filename =  "../../truth_teller/sentence_to_truthteller.txt"
var limdu = require("limdu");
var ftrs = limdu.features;
var rules = require("../rule-based/rules.js")

TfIdf = natural.TfIdf
tfidf = new TfIdf()

var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9%'$+-]+/});

function cleanup(sentence)
{
  console.log(sentence)
  sentence = sentence.replace(//g, "")
  sentence = sentence.replace(//g, "")
  sentence = sentence.replace(/\^/g, "")
  sentence = sentence.replace(/\./g, "")
  sentence = sentence.replace(/\!/g, "")
  sentence = sentence.replace(/\$/g, "")
  sentence = sentence.replace(/ +(?= )/g,'')
  sentence = sentence.toLowerCase()
  console.log("\""+sentence+"\"")
  if ((sentence == "") || (sentence == " "))
    sentence = false
  console.log(sentence)
github erelsgl / nlu-server / research / ppdb / utils.js View on Github external
function generatengrams(sentence)
{
	var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9%'$+-]+/});
	var words = tokenizer.tokenize(sentence);
	
	var feature = []

	_(3).times(function(n){
		feature = feature.concat(bars.skipgrams(words, n, 3))
	})

	var features = []
	_.each(feature, function(value, key, list){ 
		if (!bars.isstopword(value))
			features.push(value.join(" "))
	}, this)

	features = _.unique(features)
	features = _.sortBy(features, function(num){ return num.length })
github sysrep / time-viz / src / lib / nlpHelpers.js View on Github external
export const getSentences = (textContent) => {
  const tokenizer = new natural.RegexpTokenizer({pattern: /[!?.]/});
  const pureContent = removeSpace(removePuncButPreserveSentences(textContent));
  return tokenizer.tokenize(pureContent);
}
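In isolation, a pattern of sentence-ending punctuation turns RegexpTokenizer into a rough sentence splitter; the sample string below is illustrative (removeSpace and removePuncButPreserveSentences are helpers defined elsewhere in nlpHelpers.js and are not shown here):

new natural.RegexpTokenizer({pattern: /[!?.]/}).tokenize('One. Two! Three?')
// → roughly [ 'One', ' Two', ' Three' ]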
github dadi / api / dadi / lib / model / search.js View on Github external
Search.prototype.tokenise = function(query) {
  const tokeniser = new natural.RegexpTokenizer({
    pattern: new RegExp(/[^a-zA-Z\u00C0-\u017F]/i)
  })

  return tokeniser.tokenize(query).map(word => {
    return word.toLowerCase()
  })
}
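The \u00C0-\u017F range in the pattern keeps Latin-1 Supplement and Latin Extended-A letters inside tokens, so accented words are not broken apart before lowercasing. Exercising just the tokenizer (the query string is illustrative):

const tokeniser = new natural.RegexpTokenizer({
  pattern: new RegExp(/[^a-zA-Z\u00C0-\u017F]/i)
})

tokeniser.tokenize('Crème Brûlée recipes').map(word => word.toLowerCase())
// → roughly [ 'crème', 'brûlée', 'recipes' ]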

natural

General natural language (tokenizing, stemming (English, Russian, Spanish), part-of-speech tagging, sentiment analysis, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance, Dice's Coefficient) facilities for node.

MIT
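
Several of the excerpts above pair RegexpTokenizer with other parts of the library: TfIdf documents are tokenized with a custom tokenizer via setTokenizer, and PorterStemmer supplies lemma-like normalization. A minimal, self-contained sketch of that combination (the documents and the query term are made up for illustration):

var natural = require('natural');

var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9]+/});

// Tell TfIdf to use the custom tokenizer, as in the dadi/api analyser above.
var tfidf = new natural.TfIdf();
tfidf.setTokenizer(tokenizer);
tfidf.addDocument('node gives javascript access to the file system', 'doc1');
tfidf.addDocument('natural adds tokenizing and stemming to node', 'doc2');

console.log(natural.PorterStemmer.stem('stemming'));   // → 'stem'
console.log(tfidf.tfidf('stemming', 1));               // tf-idf weight of 'stemming' in the second document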