Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
/* global it */
/* global describe */
// Test fixture setup: builds an object-mode Readable stream and queues the
// sample documents that the tests consume.
const Readable = require('stream').Readable
// Log level comes from the LOG_LEVEL environment variable, defaulting to 'error'.
const logLevel = process.env.LOG_LEVEL || 'error'
// objectMode lets the stream carry plain JS objects instead of Buffers/strings.
const s = new Readable({ objectMode: true })
// Sandbox path string; where it is used is not visible in this fragment.
const sandboxPath = 'test/sandbox'
const should = require('should')
// The `en` export of the `stopword` package (presumably its English stop-word list).
const stopwords = require('stopword').en
// Sample document 'a'. `year` holds several string values for a single document.
s.push({
id: 'a',
title: 'The Beatles',
content: 'The Beatles were an English rock band, formed in Liverpool in 1960. Beatles from Liverpool',
year: ['1960', '1961', '1962']
})
// Sample document 'b'. It shares the year '1962' with document 'a'.
s.push({
id: 'b',
title: 'The Rolling Stones',
content: 'The Rolling Stones are an English rock band formed in London in 1962.',
year: ['1962', '1963', '1964']
})
s.push({
id: 'c',
title: 'Pink Floyd',
const textItems = this.$store.state.leotext
children.forEach(child => {
const t = textItems[child.t]
let textData = {}
try {
textData = JSON.parse(t)
} catch (e) {
console.log(e, child.id)
}
items.push(_.get(textData, this.from, ''))
})
let text = items.join()
// text = stripchar.RSspecChar(text.toLowerCase())
text = text.replace(/[[\]&,;'"”’().*?]/g, ' ')
let words = split(text)
words = sw.removeStopwords(words)
const wf = {}
_.remove(words, word => /\d/.test(word))
words.forEach(word => {
if (word.length < 4) { return }
word = word.toLowerCase()
wf[word] = wf[word] ? wf[word] + 1 : 1
})
// debugger
const wordFreq = {}
Object.keys(wf).forEach(k => {
const v = wf[k]
if (v > this.threshold) wordFreq[k] = v
})
const keys = Object.keys(wordFreq)
keys.forEach(k => {
if (wordFreq[k + 's']) {
.splice(2)
.join("/")
.replace(/\.[^/.]+$/, "");
//Remove _index + index files from uri
const compUriArray = item.uri.split("/");
const lastItemInCompArray = compUriArray[compUriArray.length - 1];
if (
lastItemInCompArray.includes("index") ||
lastItemInCompArray.includes("_index")
) {
compUriArray.pop();
item.uri = compUriArray.join("/");
}
let content = stopword
.removeStopwords(meta.content.split(/\s+/))
.join(" ")
.replace(/\W/g, " ")
.trim();
let truncatedContent = truncate(content, _this.contentSize); // 20kB limit
item.content = truncatedContent;
// If this is a partial index, remove everything but the props we want
if (self.partial) {
item = _.pick(item, self.customInd);
}
// Include an objectID to prevent duplicated entries in the index.
item.objectID = meta.data.objectID
? meta.data.objectID
: item.uri
const withoutStopWords = str => {
// turn the string into an array of words
const arr = strToArray(str);
// filter out any words that are considered stop words
const cleaned = stopword.removeStopwords(arr);
// join the array back into a string
const joined = cleaned.join(' ');
// return the string
return joined;
};
/**
 * Returns `str` with its stop words removed.
 *
 * @param str - Text to clean; it is split into words by `strToArray`.
 * @returns The words that `stopword.removeStopwords` kept, joined by single spaces.
 */
export const withoutStopWords = (str: string): string => {
  // Split into words and drop the stop words in one step.
  const keptWords = stopword.removeStopwords(strToArray(str));
  // Reassemble the surviving words into a single space-separated string.
  return keptWords.join(' ');
};
export const withoutStopWords = (str) => {
// turn the string into an array of words
const arr = strToArray(str);
// filter out any words that are considered stop words
const cleaned = stopword.removeStopwords(arr);
// join the array back into a string
const joined = cleaned.join(' ');
// return the string
return joined;
};
import stopword from 'stopword'
import pipeline from './search-index-pipeline'
import { convertMetaDocId } from 'src/activity-logger'
import { RESULT_TYPES } from 'src/overview/constants'
const indexOpts = {
batchSize: 500,
appendOnly: false,
indexPath: 'worldbrain-index',
logLevel: 'info',
preserveCase: false,
compositeField: false,
nGramLength: 1,
// separator: /[|' .,\-|(\n)]+/,
stopwords: stopword.en,
fieldOptions: {
// The `domain.tld(.cctld)` data from a page's URL
// Currently used to afford `domain.tld(.cctld)` search in our queries
// Should never need to tokenize, but put forward-slash separator incase preproecssing fails for whatever reason
// (then domain search can still happen)
domain: {
weight: 40,
fieldedSearch: true,
separator: '/',
},
// Page title text; occasionally empty
title: {
weight: 30,
fieldedSearch: true,
},
// Page URL tokenized by forward slashes; normalized slightly to remove protocol and leading `www.`
/**
 * Turns raw text into word tokens, optionally dropping stop words first.
 *
 * @param text - Input text to tokenize.
 * @param options - Optional settings; the whole argument may be omitted.
 *   `removeSW` (default `false`): when `true`, the text is split on single
 *   spaces and every piece listed in `ENGLISH_STOP_WORDS` is removed before
 *   the text is tokenized.
 * @returns The tokens produced by `WordTokenizer.tokenize`.
 */
private preprocess(
  text: string,
  { removeSW = false }: { removeSW?: boolean } = {},
): string[] {
  const tokenizer = new WordTokenizer();
  // Splitting on ' ' and re-joining with ' ' returns the original string, so
  // the round trip is only needed when stop words are actually being removed.
  const cleaned = removeSW
    ? sw.removeStopwords(text.split(' '), ENGLISH_STOP_WORDS).join(' ')
    : text;
  return tokenizer.tokenize(cleaned);
}
}
exports.getVector = function(text, options) {
if (typeof text != "string")
throw new Error("error: input must be a string");
var defaults = {
nGramLength: 1,
separator: /[\|' \.,\-|(\n)]+/,
stopwords: sw.getStopwords()
}
options = _.defaults(options || {}, defaults)
if (options.nGramLength == 0)
throw new Error("error: nGramLength must be greater than 0");
//tokenise string, remove stopwords
var tokens = sw.removeStopwords(text, {
inputSeparator: options.separator,
stopwords: options.stopwords
}).split(' ');
var vec = []
if (!isNaN(options.nGramLength)) {
return getTermVectorForNgramLength(tokens, options.nGramLength);
}
else if (options.nGramLength.constructor === Array) {
for (var i = 0; i < options.nGramLength.length; i++)
vec = vec.concat(getTermVectorForNgramLength(tokens, options.nGramLength[i]))
return vec;
}
else if (typeof(options.nGramLength)
&& (parseInt(options.nGramLength.gte) <= parseInt(options.nGramLength.lte))) {
var j = parseInt(options.nGramLength.gte);
while (j <= options.nGramLength.lte) {
exports.getVector = function(text, options) {
if (typeof text != "string")
throw new Error("error: input must be a string");
var defaults = {
nGramLength: 1,
separator: /[\|' \.,\-|(\n)]+/,
stopwords: sw.getStopwords()
}
options = _.defaults(options || {}, defaults)
if (options.nGramLength == 0)
throw new Error("error: nGramLength must be greater than 0");
//tokenise string, remove stopwords
var tokens = sw.removeStopwords(text, {
inputSeparator: options.separator,
stopwords: options.stopwords
}).split(' ');
var vec = []
if (!isNaN(options.nGramLength)) {
return getTermVectorForNgramLength(tokens, options.nGramLength);
}
else if (options.nGramLength.constructor === Array) {
for (var i = 0; i < options.nGramLength.length; i++)
vec = vec.concat(getTermVectorForNgramLength(tokens, options.nGramLength[i]))