Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import { BaseStemmer } from '@nlpjs/core/src'
import { StemmerCa } from '@nlpjs/lang-ca/src'
import { StemmerEn } from '@nlpjs/lang-en/src'
import { StemmerEs } from '@nlpjs/lang-es/src'
import { StemmerPt } from '@nlpjs/lang-pt/src'
import { StemmerPl } from './stemmers/polish-stemmer'
// see https://github.com/axa-group/nlp.js/blob/HEAD/docs/language-support.md
// and https://stackoverflow.com/a/11210358/145289
// Snowball algorithm inspired by https://github.com/MihaiValentin/lunr-languages, which is based on
// https://github.com/fortnightlabs/snowball-js/blob/master/stemmer/src/ext/SpanishStemmer.js, itself based on
// the Java version at http://snowball.tartarus.org/download.html
/**
 * Stemmer instances keyed by locale code (`ca`, `en`, `es`, `pl`, `pt`).
 * Every instance is constructed once, when this module is evaluated.
 */
export const stemmers: Record<string, BaseStemmer> = {
  ca: new StemmerCa(),
  en: new StemmerEn(),
  es: new StemmerEs(),
  // node-nlp does not support Polish; StemmerPl is imported from ./stemmers/polish-stemmer
  pl: new StemmerPl(),
  pt: new StemmerPt(),
}
/**
 * Looks up the stemmer registered in `stemmers` for a locale.
 *
 * Only the map's own keys are considered, so names inherited from
 * `Object.prototype` (e.g. `constructor`, `toString`, `__proto__`) are
 * rejected like any other unknown locale instead of being handed back as if
 * they were stemmers.
 *
 * @param locale - key to look up in `stemmers`
 * @returns the stemmer stored under `locale`
 * @throws Error when no stemmer is configured for `locale`
 */
export function stemmerFor(locale: string): BaseStemmer {
  // A plain `stemmers[locale]` would also resolve inherited properties,
  // which are truthy and would slip past the check below.
  const stem = Object.prototype.hasOwnProperty.call(stemmers, locale)
    ? stemmers[locale]
    : undefined
  if (!stem) {
    throw new Error(`No stemmer configured for locale '${locale}'`)
  }
  return stem
}
return this.trim(normalized.split(/[^a-zA-Zá-úÁ-ÚñÑüÜ]+/))
}
/**
 * Strips empty strings from both ends of `arr`, editing the array in place.
 * Empty strings located between non-empty entries are kept.
 *
 * @param arr - array to trim; it is mutated
 * @returns the same array instance that was passed in
 */
private trim(arr: string[]): string[] {
  // Drop empty entries from the tail, one at a time.
  for (let last = arr.length - 1; last >= 0 && arr[last] === ''; last--) {
    arr.pop()
  }
  // Count the empty entries at the head and remove them in a single step.
  let leading = 0
  while (arr[leading] === '') {
    leading++
  }
  if (leading > 0) {
    arr.splice(0, leading)
  }
  return arr
}
}
/**
 * Tokenizer instances keyed by locale code, constructed once when this module
 * is evaluated. `pl` is mapped to a plain `Tokenizer` instance, whereas the
 * other locales each use their own `Tokenizer*` class.
 */
const tokenizers: Record<string, Tokenizer> = {
  es: new TokenizerEs(),
  en: new TokenizerEn(),
  ca: new TokenizerCa(),
  pl: new Tokenizer(),
  pt: new TokenizerPt(),
}
/**
 * Returns the tokenizer stored in `tokenizers` under `locale`.
 *
 * NOTE(review): the lookup is not checked. For a locale that is not a key of
 * `tokenizers` it evaluates to `undefined` at runtime, despite the declared
 * return type — confirm callers never pass such a locale.
 *
 * @param locale - locale used as the lookup key
 * @returns the tokenizer mapped to `locale`
 */
export function tokenizerPerLocale(locale: Locale): Tokenizer {
  const tokenizer = tokenizers[locale]
  return tokenizer
}
/** Default separator characters: `;` `,` `.` `/` `(` `)` `!` `?` `"` and the space. */
export const DEFAULT_SEPARATORS = ';,./()!?" '

/**
 * Global regex matching any single character of `DEFAULT_SEPARATORS`.
 * The separator string is inserted verbatim into a character class.
 * Because of the `g` flag the object keeps `lastIndex` state between
 * `test()` / `exec()` calls.
 */
export const DEFAULT_SEPARATORS_REGEX = new RegExp(`[${DEFAULT_SEPARATORS}]`, 'g')
export const DEFAULT_NOT_SEPARATORS_REGEX = new RegExp(
'[^' + DEFAULT_SEPARATORS + ']',