// assumes a Lemmatizer implementation is in scope, e.g. the javascript-lemmatizer package
var Lemmatizer = require('javascript-lemmatizer');
var lemmatizer = new Lemmatizer();
var antonyms = {};
/*
var data = fs.readFileSync("./antonyms.txt", 'utf8').split("\n")
_.each(data, function(value, key, list){
  var value1 = value.split(",")
  antonyms[value1[0]] = value1[1]
  antonyms[value1[1]] = value1[0]
}, this)
*/
var old_unused_tokenizer = {tokenize: function(sentence) { return sentence.split(/[ \t,;:.!?]/).filter(function(a){return !!a}); }}
var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9\-\?]+/});
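// For illustration (not in the original): both tokenizers split on whitespace and
// punctuation, but the Regexp-based one treats '-' and '?' as token characters, so a
// query like 'how-to?' tokenizes as ['how-to?'] rather than ['how-to'].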
console.vlog = function(data) { fs.appendFileSync(log_file, data + '\n', 'utf8') };
// var tokenizer = new natural.WordTokenizer({'pattern':(/(\W+|\%)/)}); // WordTokenizer, TreebankWordTokenizer, WordPunctTokenizer
// var ngrams = new natural.NGrams.ngrams()
// var enhance = function (classifierType, featureExtractor, inputSplitter, featureLookupTable, labelLookupTable, preProcessor, postProcessor, TestSplitLabel, multiplyFeaturesByIDF, featureExpansion, featureExpansionScale, featureExpansionPhrase, featureFine, expansionParam) {
var enhance = function (classifierType, featureExtractor, inputSplitter, featureLookupTable, labelLookupTable, preProcessor, postProcessor, TestSplitLabel, multiplyFeaturesByIDF, featureOptions) {
// var enhance = function (classifierType, featureLookupTable, labelLookupTable) {
return classifiers.EnhancedClassifier.bind(0, {
normalizer: normalizer,
inputSplitter: inputSplitter,
featureOptions:featureOptions,
// featureExpansion: featureExpansion,
// Sample data for these examples (coerced to strings):
var fs = require('fs'),
    natural = require('natural'),
    tokenizer = new natural.WordTokenizer();
var economy = ' ' + fs.readFileSync('data/texts/economy.txt');
var politics = ' ' + fs.readFileSync('data/texts/politics.txt');
var sports = ' ' + fs.readFileSync('data/texts/sports.txt');
console.log("\n-- Tokenized sample text in politics.txt:");
console.log(tokenizer.tokenize(politics));
console.log("\n-- Use Porter Stemmer on a single word:");
console.log(natural.PorterStemmer.stem("dogs"));
natural.PorterStemmer.attach(); // add methods to string
console.log("\n-- Use Porter Stemmer text in file sports.txt:");
console.log(sports.tokenizeAndStem());
console.log("dog dogs Dog dogged".tokenizeAndStem());
var classifier = new natural.BayesClassifier();
classifier.addDocument(economy, 'economy');
classifier.addDocument(politics, 'politics');
classifier.addDocument(sports, 'sports');
classifier.train();
console.log("\n-- Bayesian classifier test results:");
console.log(classifier.classify('The President and Congress went on vacation.'));
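// A small addition to the example above (not in the original snippet): BayesClassifier
// also exposes getClassifications(), which returns the per-label scores behind classify().
classifier.getClassifications('The President and Congress went on vacation.')
  .forEach(function (classification) {
    console.log(classification.label, classification.value);
  });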
function createHashedIndexFiles(lang, indexPath, indexData, type) {
var words_to_stem = {};
var stem_to_words = {};
var stemmer = null;
switch (lang) {
case 'eng':
stemmer = natural.PorterStemmer;
break;
case 'esp':
stemmer = natural.PorterStemmerEs;
break;
}
//console.log('trying to create index', stemmer);
if (type == 'words' && stemmer != null) {
// make stems
for (var key in indexData) {
var wordData = indexData[key],
stemmedWord = stemmer.stem(key);
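      // Hedged completion of the truncated loop above (the original continues beyond
      // this snippet): the two maps suggest each word is keyed to its stem, and words
      // sharing a stem are grouped, before the hashed index files are written out.
      words_to_stem[key] = stemmedWord;
      (stem_to_words[stemmedWord] = stem_to_words[stemmedWord] || []).push(key);
    }
  }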
/* tslint:disable */
import * as natural from 'natural';
const tokenizer = new natural.WordTokenizer();
console.log(tokenizer.tokenize('your do a dog dog has fleas.'));
const NGrams = natural.NGrams;
console.log(NGrams.ngrams('This is a text document to analyze.', 5));
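// natural also ships bigram/trigram shortcuts for the common cases; an illustrative
// addition, not part of the original snippet:
console.log(NGrams.bigrams('This is a text document to analyze.'));
console.log(NGrams.trigrams('This is a text document to analyze.'));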
import { CountVectorizer } from './text';
//
const cv = new CountVectorizer();
const text1 = ['deep learning ian good fellow learning jason shin shin', 'yoshua bengio'];
console.log('original text', text1);
const vocabCounts = cv.fit_transform(text1);
console.log(vocabCounts);
console.log(cv.vocabulary);
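// Related sketch (assumes the `natural` import above): raw term counts can be weighted
// by inverse document frequency with natural's TfIdf instead of the CountVectorizer.
const tfidf = new natural.TfIdf();
tfidf.addDocument('deep learning ian good fellow learning jason shin shin');
tfidf.addDocument('yoshua bengio');
tfidf.tfidfs('learning', (docIndex, measure) => {
  console.log('doc ' + docIndex + ' tf-idf for "learning": ' + measure);
});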
// -- More info: https://github.com/NaturalNode/natural
'use strict';
import Natural from 'natural';
// -- Internal
const tokenizer = new Natural.WordTokenizer()
export default (phrase) => {
const time = new Date();
return new Promise((resolve, reject) => {
  const tokens = tokenizer.tokenize(phrase);
  // log each token alongside its Porter stem
  tokens.forEach(token => {
    console.log(token, Natural.PorterStemmer.stem(token));
  });
  Natural.LancasterStemmer.attach();
  console.log(phrase.tokenizeAndStem());
  resolve({
    engine: 'compromise',
    ms: (new Date() - time),
    tokens: tokenizer.tokenize(phrase),
    stemmers: Natural.PorterStemmer.stem(phrase)
    // glossary: glossary.parse(phrase),
    // sentiment: analyser.classify(phrase),
  });
});
};
function listen() {
const host = server.address().address;
const port = server.address().port;
console.log('Example app listening at http://' + host + ':' + port);
}
// Do we already have a classifier "database"
const exists = fs.existsSync('classifier.json');
// If we do, load it
if (exists) {
natural.BayesClassifier.load('classifier.json', null, loaded);
// If not make a new one
} else {
console.log('starting a new classifier');
classifier = new natural.BayesClassifier();
}
// All set and loaded
function loaded(err, cf) {
  if (err) {
    console.error('Failed to load classifier:', err);
    return;
  }
  classifier = cf;
  console.log('Classifier loaded');
}
// This is a post for training
app.post('/train', training);
function training(req, res) {
// Get the text and category
const text = req.body.text;
const category = req.body.category;
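  // Hedged sketch of how this handler presumably continues: add the labeled example,
  // retrain, persist the classifier, and acknowledge the request (field names assumed).
  classifier.addDocument(text, category);
  classifier.train();
  classifier.save('classifier.json', (err) => res.send({ trained: !err }));
}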
var natural = require('natural')
var csv = require('fast-csv')
var fs = require('fs')
var path = require('path')
var jsonfile = require('jsonfile')
natural.LancasterStemmer.attach();
var query_classify = new natural.BayesClassifier();
var both_streams_ended = false;
var NGrams = natural.NGrams;
console.log('Non-queries');
var stream = fs.createReadStream(path.resolve('./plugins/training', 'conv_bot_db_augmented.csv'))
.pipe(csv.parse())
.on('readable', function(){
var row;
while(null !== (row = stream.read())){
// console.log('unstemmed:',row);
// console.log('stemmed:',)
// console.log('->added document:',row[0])
var bgrams1 = NGrams.ngrams(row[0].toLowerCase(), 2, null, '[end]')
// var bgrams1 = NGrams.bigrams(row[0].toLowerCase());
for (var i = 0; i < bgrams1.length; i++) {
query_classify.addDocument(bgrams1[i], 'no');
};
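// For illustration (assumed input): NGrams.ngrams('is it raining', 2, null, '[end]')
// yields [['is','it'], ['it','raining'], ['raining','[end]']]; the '[end]' padding
// marks sentence-final bigrams before they are added as 'no' (non-query) documents.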
if (!('sentences' in sample))
throw new Error("for some reason sentences not in sample")
if (!_.isArray(sample['sentences']))
sample['sentences'] = [sample['sentences']]
var tokens = _.compact(_.flatten(_.pluck(sample['sentences'], 'tokens')))
var words = []
_.each(tokens, function(token, key, list){
words.push(token.word.toLowerCase())
}, this)
// var feature = natural.NGrams.ngrams(words, 1).concat(natural.NGrams.ngrams(words, 2))
var feature = natural.NGrams.ngrams(words, 1)
_.each(feature, function(value, key, list){
features[value] = 1
}, this)
console.log("feAsyncPrimitive: train: "+train+" FEATURES: "+JSON.stringify(features, null, 4))
callback(null, features)
}
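// For illustration: with tokens ['who', 'are', 'you'] the unigram n-grams above are
// single-element arrays, and object key coercion turns them into string keys, so
// `features` ends up as { who: 1, are: 1, you: 1 }.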
if ("input" in sample)
sample = sample.input
/* if (!('basic-dependencies' in sample['sentences']))
throw new Error("train:"+train+" basic-dependencies not in the sample "+JSON.stringify(sample))
*/
/* if (!('sentences' in sample))
throw new Error("for some reason sentences not in sample "+JSON.stringify(sample))
*/
/* if (!('tokens' in sample['sentences']))
throw new Error("for some reason tokens not in sample"+JSON.stringify(sample, null, 4))
*/
if (_.isArray(sample['sentences']))
throw new Error("feAsync is only for object sentences")
var tokenizer = new natural.RegexpTokenizer({pattern: /[^\%a-zA-Z0-9\-\?]+/});
var text = regexpNormalizer(sample["text"].toLowerCase())
console.vlog("feAsyncStanford: text: "+text)
// the array of tokens
// var tokenized = tokenizer.tokenize(text)
// console.vlog("feAsyncStanford: tokenized: "+JSON.stringify(tokenized, null, 4))
// sample['sentences'] = {"tokens":[]}
// _.each(tokenized, function(value, key, list){
// sample['sentences']['tokens'].push({
// "word": value,
// // "lemma": value[0]
// "lemma": natural.PorterStemmer.stem(value)
// // "lemma": lemmerEng.lemmatize(value[0])
// })