import itertools
import re

# Assumed imports: NLTK's tokenizer and the unidecode package; isWord,
# list_laughing and STOPWORDS are defined elsewhere in the original module.
from nltk.tokenize import word_tokenize
from unidecode import unidecode


def textcleaning(string):
    # Drop hashtags, mentions and URLs.
    string = re.sub(r'http\S+|www\.\S+', '', ' '.join(
        [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]))
    # Transliterate to ASCII and make sure '.' and ',' are followed by a space.
    string = unidecode(string).replace('.', '. ')
    string = string.replace(',', ', ')
    # Keep only letters, quotes, hyphens and spaces.
    string = re.sub(r'[^\'\"A-Za-z\- ]+', '', unidecode(string))
    # Tokenize and keep only tokens recognised as words.
    string = [y.strip() for y in word_tokenize(string.lower()) if isWord(y.strip())]
    # Drop laughing tokens and tokens whose first half repeats as the second half.
    string = [y for y in string if all(y.find(k) < 0 for k in list_laughing)
              and y[:len(y) // 2] != y[len(y) // 2:]]
    string = ' '.join(string).lower()
    # Collapse runs of repeated characters down to at most two.
    string = (''.join(''.join(s)[:2]
                      for _, s in itertools.groupby(string))).split()
    # Finally, remove stopwords.
    return ' '.join([y for y in string if y not in STOPWORDS])
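# A stand-alone sketch (not part of the original module) of the character-squeezing
# step used above: after transliteration with unidecode, itertools.groupby collapses
# runs of the same character to at most two occurrences. The helper name is made up.
import itertools

from unidecode import unidecode


def squeeze_repeats(text, max_run=2):
    # 'sooooo' -> 'soo', 'coooool' -> 'cool': keep at most `max_run` repeats per run.
    return ''.join(''.join(group)[:max_run] for _, group in itertools.groupby(text))


print(squeeze_repeats(unidecode('sóóóóó cóóóól')))  # -> 'soo cool'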
o = self.unidecode(s)
self.assertEqual('THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG 1234567890', o)
    def test_enclosed_alphanumerics(self):
        self.assertEqual(
            'aA20(20)20.20100',
            self.unidecode(_u('ⓐⒶ⑳⒇⒛⓴⓾⓿')),
        )


class TestUnidecode(BaseTestUnidecode, unittest.TestCase):
    unidecode = staticmethod(unidecode)


class TestUnidecodeExpectASCII(BaseTestUnidecode, unittest.TestCase):
    unidecode = staticmethod(unidecode_expect_ascii)


class TestUnidecodeExpectNonASCII(BaseTestUnidecode, unittest.TestCase):
    unidecode = staticmethod(unidecode_expect_nonascii)


if __name__ == "__main__":
    unittest.main()
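# Illustrative only (not from the test suite above): all three public entry points of
# the unidecode package return the same transliteration; unidecode_expect_ascii is
# optimised for input that is mostly ASCII, unidecode_expect_nonascii for input that
# is mostly non-ASCII.
from unidecode import unidecode, unidecode_expect_ascii, unidecode_expect_nonascii

print(unidecode('Ünïcödé'))                  # -> 'Unicode'
print(unidecode_expect_ascii('Ünïcödé'))     # -> 'Unicode'
print(unidecode_expect_nonascii('Ünïcödé'))  # -> 'Unicode'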
def parse_parser_results(text):
""" This is the nasty bit of code to interact with the command-line
interface of the CoreNLP tools. Takes a string of the parser results
and then returns a Python list of dictionaries, one for each parsed
sentence.
"""
results = {"sentences": []}
state = STATE_START
lines = unidecode(text.decode('utf-8')).split("\n")
for index, line in enumerate(lines):
line = line.strip()
if line.startswith("Sentence #"):
sentence = {'words': [], 'parsetree': [], 'dependencies': []}
results["sentences"].append(sentence)
state = STATE_TEXT
elif state == STATE_TEXT:
sentence['text'] = remove_escapes(line)
state = STATE_WORDS
elif state == STATE_WORDS:
if not line.startswith("[Text="):
raise ParserError('Parse error. Could not find "[Text=" in: %s' % line)
for s in WORD_PATTERN.findall(line):
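# Illustrative only: the parser output arrives as UTF-8 bytes, so it is decoded to a
# str first and then flattened to ASCII with unidecode, as in the function above.
from unidecode import unidecode

raw = 'Sentence #1: café…'.encode('utf-8')   # bytes, like the raw CoreNLP output
text = unidecode(raw.decode('utf-8'))
print(text)  # -> 'Sentence #1: cafe...'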
"""
consolidated_ids = []
consolidated_tuple_list = []
look_person_list = raw_person_list
last_count = self.exp_tab.row_count
for raw_person in raw_person_list:
# print('ID consolidated: ' + str(len(consolidated_ids)))
if 'count' in raw_person:
count = float(raw_person['count'])
else:
count = last_count
last_count = count
act_name = raw_person['label']
if not isinstance(act_name, str):
act_name = '[Not named]'
act_uniname = unidecode(act_name)
act_id = raw_person['id']
for look_person in look_person_list:
look_id = look_person['id']
if not isinstance(look_person['label'], str):
look_person['label'] = '[Not named]'
if look_id != act_id and \
look_id not in consolidated_ids and \
(act_name == look_person['label'] or act_uniname == unidecode(look_person['label'])):
# same name but different record for a person,
# lets consolidate it
consolidated_ids.append(look_person['id'])
count += float(look_person['count'])
if act_id not in consolidated_ids:
# print('Adding ' + str(unidecode(act_name)))
person_tuple = (act_name, count)
consolidated_tuple_list.append(person_tuple)
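# A minimal sketch (hypothetical helper, not in the original code) of the comparison
# used in the loop above: two labels refer to the same person if they match exactly
# or after accent-stripping with unidecode.
from unidecode import unidecode


def same_person(name_a, name_b):
    return name_a == name_b or unidecode(name_a) == unidecode(name_b)


print(same_person('José García', 'Jose Garcia'))  # -> True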
import unidecode


def read_file(filename):
    # Read the whole file and transliterate its contents to plain ASCII.
    text = unidecode.unidecode(open(filename).read())
    return text, len(text)
def writeDocIdStatus(outDir, pmid, msg, longMsg, details=None):
    """ append a line to the doc status file in outDir """
    fname = join(outDir, PMIDSTATNAME)
    # Append if the status file already exists, otherwise create it.
    if isfile(fname):
        outFh = open(fname, 'a')
    else:
        outFh = open(fname, 'w')
    # unidecode keeps every status field plain ASCII.
    row = [str(pmid), unidecode.unidecode(msg)]
    if longMsg is not None:
        row.append(unidecode.unidecode(longMsg))
    if details is not None:
        row.append(unidecode.unidecode(details))
    # Python 2 idiom: encode each field to UTF-8 bytes before logging and writing.
    row = [x.encode('utf8') for x in row]
    logging.info('Document status (pmid, logType, desc, details): %s' % ','.join(row))
    outFh.write('\t'.join(row))
    outFh.write('\n')
    return
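# A hypothetical Python 3 re-write of the helper above: unidecode keeps every field
# plain ASCII before it is appended as one tab-separated line (the function and
# argument names here are illustrative, not from the original project).
import unidecode


def append_status(path, pmid, msg, long_msg=None, details=None):
    fields = [str(pmid), unidecode.unidecode(msg)]
    if long_msg is not None:
        fields.append(unidecode.unidecode(long_msg))
    if details is not None:
        fields.append(unidecode.unidecode(details))
    with open(path, 'a', encoding='ascii') as fh:
        fh.write('\t'.join(fields) + '\n')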
from unidecode import unidecode


def searchMissingSlotValues(values: list, slot: dict) -> list:
    # An automatically extensible slot accepts anything, so nothing can be missing.
    if slot['automaticallyExtensible']:
        return list()
    # Collect the known values and synonyms, lower-cased and accent-stripped.
    allValues = list()
    for slotValue in slot['values']:
        allValues.append(unidecode(slotValue['value']).lower())
        allValues.extend([unidecode(x).lower() for x in slotValue.get('synonyms', list())])
    # Report the values that the slot definition does not cover.
    return [value for value in values if unidecode(value).lower() not in allValues]
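# Hypothetical usage of searchMissingSlotValues() above; the slot definition is made
# up. Matching is case- and accent-insensitive, so only 'Bern' is reported as missing.
slot = {
    'automaticallyExtensible': False,
    'values': [
        {'value': 'Genève', 'synonyms': ['Geneva']},
        {'value': 'Zürich'},
    ],
}
print(searchMissingSlotValues(['geneve', 'Bern', 'ZURICH'], slot))  # -> ['Bern']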
import re
import urllib.parse

import unidecode


def slugify(str):
    # Credit: http://stackoverflow.com/a/8366771
    # Undo URL escaping, transliterate to lower-case ASCII, then replace every run
    # of non-word characters with a single hyphen.
    str = urllib.parse.unquote(str)
    str = unidecode.unidecode(str).lower()
    ret = re.sub(r'\W+', '-', str)
    return ret
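# Hypothetical call to slugify() above: URL-encoded and accented input is reduced to
# a plain ASCII slug (the input string is made up).
print(slugify('Déjà%20Vu%3A%20The%20Café'))  # -> 'deja-vu-the-cafe'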
for corpus, qb, wiki, source in [("wiki", False, True, False),
("qb", True, False, False),
("source", False, False, True)
]:
# Add training data
start = time.time()
for title, text in text_iterator(wiki, flags.wiki_location,
qb, flags.question_db,
source, flags.source_location,
flags.max_pages,
min_pages=min_answers):
norm_title = lm.normalize_title(corpus, title)
doc_num += 1
if doc_num % 500 == 0 or time.time() - start > 10:
print("Adding train doc %i, %s (%s)" %
(doc_num, unidecode(title), corpus))
start = time.time()
lm.add_train(norm_title, text)
lm.add_train("compare_%i" % lm.compare(norm_title), text)
print("Done training")
if flags.lm_out:
    # Create the extractor object and write out the pickle
    with open(flags.lm_out, 'w') as o:
        lm.write_lm(o)
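# Illustrative only: unidecode keeps the progress line printable on ASCII-only
# terminals, which is why the title is transliterated before formatting above.
from unidecode import unidecode

title = 'Œdipus at Colonus'
print("Adding train doc %i, %s (%s)" % (1, unidecode(title), 'wiki'))
# -> Adding train doc 1, OEdipus at Colonus (wiki)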