How to use the natural.WordTokenizer function in natural

To help you get started, we've selected a few natural.WordTokenizer examples, based on popular ways the library is used in public projects.
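Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: construct a tokenizer with new natural.WordTokenizer() and call its tokenize method on a string to get back an array of word tokens (the sample sentence is just an illustration).

const natural = require('natural');

// WordTokenizer splits the input on whitespace and punctuation and
// returns an array of word tokens; the punctuation itself is dropped.
const tokenizer = new natural.WordTokenizer();

console.log(tokenizer.tokenize('your dog has fleas.'));
// -> [ 'your', 'dog', 'has', 'fleas' ]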


Example from machinelearnjs/machinelearnjs (src/lib/feature_extraction/index.repl.ts):
/* tslint:disable */
import * as natural from 'natural';
const tokenizer = new natural.WordTokenizer();
console.log(tokenizer.tokenize('your do a dog dog has fleas.'));

const NGrams = natural.NGrams;
console.log(NGrams.ngrams('This is a text document to analyze.', 5));

import { CountVectorizer } from './text';

//
const cv = new CountVectorizer();

const text1 = ['deep learning ian good fellow learning jason shin shin', 'yoshua bengio'];

console.log('original text', text1);
const vocabCounts = cv.fit_transform(text1);
console.log(vocabCounts);
console.log(cv.vocabulary);

Example from ava-ia/core (composers/nlp/nlp.natural.js):
// -- More info: https://github.com/NaturalNode/natural
'use strict';

import Natural from 'natural';
// -- Internal
const tokenizer = new Natural.WordTokenizer()

export default (phrase) => {
  const time = new Date();
  return new Promise((resolve, reject) => {
    const tokens = tokenizer.tokenize(phrase);
    tokens.map(token => {
      console.log(token, Natural.PorterStemmer.stem(token))
    });

    Natural.LancasterStemmer.attach();
    console.log(phrase.tokenizeAndStem());

    resolve({
      engine: 'compromise',
      ms: (new Date() - time),

Example from nodejitsu/handbook (index.js):
'use strict';

var path = require('path'),
    fs = require('fs'),
    natural = require('natural'),
    lunr = require('lunr'),
    tokenizer = new natural.WordTokenizer(),
    loc = path.resolve(__dirname, 'content'),
    scraper = {
      title: /\[meta:title\]:\s<>\s\((.+?)\)(?!\))/,
      description: /\[meta:description\]:\s<>\s\((.+?)\)(?!\))/,
      firstlines: /^((.*\n){2}){1,3}/
    };

//
// ### @private function scrape()
// #### @content {String} document content
// #### @key {String} scraper key
// #### @n {Number} index of match that should be returned
// Scrapes the [key] from the content by Regular Expression
//
function scrape(content, key, n) {
  if (!content) return '';

Example from googleapis/nodejs-vision (samples/textDetection.js):
async add(filename, document) {
    const PUNCTUATION = ['.', ',', ':', ''];
    const tokenizer = new natural.WordTokenizer();
    const tokens = tokenizer.tokenize(document);
    // filter out punctuation, then add all tokens to a redis set.
    await Promise.all(
      tokens
        .filter(token => PUNCTUATION.indexOf(token) === -1)
        .map(token => {
          const sadd = promisify(this.tokenClient.sadd).bind(this.tokenClient);
          return sadd(token, filename);
        })
    );
    const set = promisify(this.docsClient.set).bind(this.docsClient);
    await set(filename, document);
  }

Example from hunterloftis/summarize (lib/summarize.js):
var _ = require('lodash');
var unfluff = require('unfluff');
var natural = require('natural');
var tokenizer = new natural.WordTokenizer();
var sentiment = require('sentiment');
var Stats = require('text-statistics');
var glossary = require('glossary')({
  minFreq: 2,
  collapse: true,
  verbose: true
});

var DEFAULTS = {
  ok: false,
  sentiment: 0,
  title: null,
  topics: [],
  words: 0,
  difficulty: 0,
  minutes: 0,

Example from tldr-pages/tldr-node-client (lib/search.js):
let getTokens = (data) => {
  let tokenizer = new natural.WordTokenizer();
  let tokens = tokenizer.tokenize(data);
  tokens.forEach((word, index) => {
    word = word.toLowerCase();
    word = natural.PorterStemmer.stem(word);
    tokens[index] = word;
  });

  return tokens;
};

Example from sysrep/time-viz (src/lib/nlpHelpers.js):
export const getTokensWithOutNumbersAndStopWords = (textContent) => {
  const noNumbers = removeNumbers(textContent)
  const tokenizer = new natural.WordTokenizer();
  const tokens = tokenizer.tokenize(noNumbers);
  const tokensInLowerCase = toLowerCase(tokens);
  const singularizedTokens = getSingularizedWord(tokensInLowerCase);
  return arrayDiff(singularizedTokens, stopwords.english);
}

Example from pastelsky/bundlephobia (server/middlewares/similar-packages/similarPackages.middleware.js):
async function getCategory(packageName) {

  if (getInCategoryMap(packageName)) {
    return {
      label: getInCategoryMap(packageName),
      score: 999,
    }
  }

  const { description, keywords } = await getPackageDetails(packageName)
  const tokenizer = new natural.WordTokenizer();
  const tokenString = await stripMarkdown(description) + ' ' + keywords.join(' ')
  const packageTokens =
    tokenizer.tokenize(tokenString)
      .map(token => token.toLowerCase())
      .map(natural.PorterStemmer.stem)
      .concat(
        tokenizer.tokenize(packageName)
          .map(natural.PorterStemmer.stem)
      )

  const scores = {}
  let maxScoreCategory = {
    category: '',
    score: 0,
  }

Example from brave/vault (src/oip.js):
                                           '160x600': {}
                                         },
                                         categories: require('../config/sonobi-codes.js')
                                       })
  this.config.oip.options = underscore.defaults(this.config.oip.options || {},
                                               { refillInterval: 2 * 1000,
                                                 retryInterval: 60 * 1000,
                                                 emptyInterval: 60 * 1000,
                                                 maxFlights: 8,
                                                 lowWater: 5,
                                                 highWater: 15
                                               })

  this.pqs = {}
  this.tokenizer = new natural.WordTokenizer()
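  // For each configured category, tokenize its display name and load the
  // tokens into a trie; the trie's keys are then used as the category's intents.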
  underscore.keys(this.config.oip.categories).forEach(category => {
    var trie = new Trie()

    trie.addStrings(this.tokenizer.tokenize(this.config.oip.categories[category]))
    this.pqs[category] = { category: category,
                           name: this.config.oip.categories[category],
                           errors: 0,
                           sizes: {},
                           trie: trie,
                           intents: trie.keysWithPrefix(''),
                           query: { cat: {} }
                         }
    this.pqs[category].query.cat[category] = 24 * 60 * 60
    underscore.keys(this.config.oip.sizes).forEach(size => {
      this.pqs[category].sizes[size] = { queue: new PriorityQ(pqComparator),
                                         lowWater: this.config.oip.options.lowWater,

Example from zachleat/zachleat.com (.eleventy.js):
eleventyConfig.addLiquidFilter("getSentimentValue", function(content) {
  if (content) {
    const tokenizer = new Natural.WordTokenizer();
    return analyze.getSentiment(tokenizer.tokenize(content));
  }

  return 0;
});

natural

General natural language facilities for Node: tokenizing, stemming (English, Russian, Spanish), part-of-speech tagging, sentiment analysis, classification, inflection, phonetics, tf-idf, WordNet, Jaro-Winkler and Levenshtein distances, and Dice's coefficient.
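As a quick, minimal sketch of a few of the facilities listed above (not taken from the projects on this page; the values in the comments are what these calls are expected to return):

const natural = require('natural');

// Stemming: reduce a word to its stem with the Porter algorithm.
console.log(natural.PorterStemmer.stem('words')); // 'word'

// String comparison metrics.
console.log(natural.LevenshteinDistance('ones', 'onez'));      // 1
console.log(natural.DiceCoefficient('thing', 'thingy'));       // ~0.89
console.log(natural.JaroWinklerDistance('dixon', 'dicksonx')); // a similarity score between 0 and 1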

License: MIT