How to use the stopword.removeStopwords function in stopword

To help you get started, we’ve selected a few stopword examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github kaleguy / leovue / src / components / WordCloud.vue View on Github external
const textItems = this.$store.state.leotext
        children.forEach(child => {
          const t = textItems[child.t]
          let textData = {}
          try {
            textData = JSON.parse(t)
          } catch (e) {
            console.log(e, child.id)
          }
          items.push(_.get(textData, this.from, ''))
        })
        let text = items.join()
        // text = stripchar.RSspecChar(text.toLowerCase())
        text = text.replace(/[[\]&,;'"”’().*?]/g, ' ')
        let words = split(text)
        words = sw.removeStopwords(words)
        const wf = {}
        _.remove(words, word => /\d/.test(word))
        words.forEach(word => {
          if (word.length < 4) { return }
          word = word.toLowerCase()
          wf[word] = wf[word] ? wf[word] + 1 : 1
        })
        // debugger
        const wordFreq = {}
        Object.keys(wf).forEach(k => {
          const v = wf[k]
          if (v > this.threshold) wordFreq[k] = v
        })
        const keys = Object.keys(wordFreq)
        keys.forEach(k => {
          if (wordFreq[k + 's']) {
github replicatedhq / hugo-algolia / lib / index.js View on Github external
.splice(2)
              .join("/")
              .replace(/\.[^/.]+$/, "");

        //Remove _index + index files from uri
        const compUriArray = item.uri.split("/");
        const lastItemInCompArray = compUriArray[compUriArray.length - 1];
        if (
          lastItemInCompArray.includes("index") ||
          lastItemInCompArray.includes("_index")
        ) {
          compUriArray.pop();
          item.uri = compUriArray.join("/");
        }

        let content = stopword
          .removeStopwords(meta.content.split(/\s+/))
          .join(" ")
          .replace(/\W/g, " ")
          .trim();
        let truncatedContent = truncate(content, _this.contentSize); // 20kB limit
        item.content = truncatedContent;

        // If this is a partial index, remove everything but the props we want
        if (self.partial) {
          item = _.pick(item, self.customInd);
        }
        
        // Include an objectID to prevent duplicated entries in the index.
        item.objectID = meta.data.objectID
          ? meta.data.objectID
          : item.uri
github withspectrum / spectrum / iris / migrations / 20171208223206-index-messages-for-search.js View on Github external
/**
 * Returns `str` with its stop words removed.
 *
 * The input is turned into an array of words by `strToArray`, filtered
 * through `stopword.removeStopwords`, and the surviving words are glued
 * back together with single spaces.
 */
const withoutStopWords = str => {
  const words = strToArray(str);
  return stopword.removeStopwords(words).join(' ');
};
github withspectrum / spectrum / vulcan / utils / text-parsing.js View on Github external
/**
 * Strips stop words out of a string.
 *
 * @param str - text to clean; it is converted to a word array by `strToArray`.
 * @returns the words that `stopword.removeStopwords` kept, joined by single spaces.
 */
export const withoutStopWords = (str: string): string => {
  const words = strToArray(str);
  const kept = stopword.removeStopwords(words);
  return kept.join(' ');
};
github specfm / spec-next / servers / search / lib / utils / text-parsing.ts View on Github external
/**
 * Removes stop words from `str`.
 *
 * Pipeline: `strToArray` (string -> array of words), then
 * `stopword.removeStopwords` (filter), then `join(' ')` (back to one string).
 */
export const withoutStopWords = (str) => {
  const wordList = strToArray(str);
  return stopword.removeStopwords(wordList).join(' ');
};
github machinelearnjs / machinelearnjs / src / lib / feature_extraction / text.ts View on Github external
/**
 * Turns raw text into tokens, optionally dropping stop words first.
 *
 * @param text - input text; it is split on single spaces before filtering.
 * @param removeSW - when true, tokens are filtered with `sw.removeStopwords`
 *   against `ENGLISH_STOP_WORDS`; defaults to false.
 * @returns whatever `WordTokenizer.tokenize` yields for the re-joined text.
 */
private preprocess(text: string, { removeSW = false }): string[] {
    const tokenizer = new WordTokenizer();
    const rawTokens = text.split(' ');
    // Filtering is opt-in; otherwise the split tokens pass through untouched.
    const keptTokens = removeSW
      ? sw.removeStopwords(rawTokens, ENGLISH_STOP_WORDS)
      : rawTokens;
    const rejoined = keptTokens.join(' ');
    return tokenizer.tokenize(rejoined);
  }
}
github fergiemcdowall / term-vector / lib / term-vector.js View on Github external
exports.getVector = function(text, options) {
  if (typeof text != "string")
    throw new Error("error: input must be a string");
  var defaults = {
    nGramLength: 1,
    separator: /[\|' \.,\-|(\n)]+/,
    stopwords: sw.getStopwords()
  }
  options = _.defaults(options || {}, defaults)
  if (options.nGramLength == 0)
    throw new Error("error: nGramLength must be greater than 0");
  //tokenise string, remove stopwords
  var tokens = sw.removeStopwords(text, {
    inputSeparator: options.separator,
    stopwords: options.stopwords
  }).split(' ');
  var vec = []
  if (!isNaN(options.nGramLength)) {
    return getTermVectorForNgramLength(tokens, options.nGramLength);
  }
  else if (options.nGramLength.constructor === Array) {
    for (var i = 0; i < options.nGramLength.length; i++)
      vec = vec.concat(getTermVectorForNgramLength(tokens, options.nGramLength[i]))
    return vec;
  }
  else if (typeof(options.nGramLength)
           && (parseInt(options.nGramLength.gte) <= parseInt(options.nGramLength.lte))) {
    var j = parseInt(options.nGramLength.gte);
    while (j <= options.nGramLength.lte) {
github sciencefair-land / sciencefair / app / client / views / detail_multi_terms.js View on Github external
// One term list per paper returned by uniqBy(papers, 'key').
const terms = uniqBy(papers, 'key').map(paper => {
    const titleTerms = cleanTerms(paper.title || '')
    const abstractTerms = cleanTerms(paper.abstract || '')

    // Merge title and abstract terms, de-duplicate, then filter twice:
    // first with no list argument, then with the custom `stopwords` list.
    const merged = uniq(titleTerms.concat(abstractTerms))
    const firstPass = stopword.removeStopwords(merged)
    const filtered = stopword.removeStopwords(firstPass, stopwords)

    // Map the two hard-coded plurals to their singular form.
    return filtered.map(term => {
      switch (term) {
        case 'cells': return 'cell'
        case 'genes': return 'gene'
        default: return term
      }
    })
  })
github sciencefair-land / sciencefair / app / client / views / detail_multi_terms.js View on Github external
// One term list per paper returned by uniqBy(papers, 'key').
const terms = uniqBy(papers, 'key').map(paper => {
    const fromTitle = cleanTerms(paper.title || '')
    const fromAbstract = cleanTerms(paper.abstract || '')

    // Combine and de-duplicate the terms, then run two stop-word passes:
    // one with no list argument and one with the custom `stopwords` list.
    const combined = uniq(fromTitle.concat(fromAbstract))
    const afterFirstPass = stopword.removeStopwords(combined)
    const remaining = stopword.removeStopwords(afterFirstPass, stopwords)

    // Only 'cells' and 'genes' are singularised; every other term is kept as is.
    return remaining.map(term => {
      switch (term) {
        case 'cells': return 'cell'
        case 'genes': return 'gene'
        default: return term
      }
    })
  })

stopword

A module for Node.js and the browser that takes in text and returns it stripped of stopwords. It has pre-defined stopword lists for 62 languages and also accepts lists of custom stopwords as input.

MIT
Latest version published 16 days ago

Package Health Score

84 / 100
Full package analysis