How to use the unidecode.unidecode function in Unidecode

To help you get started, we’ve selected a few Unidecode examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github huseinzol05 / Python-DevOps / basic / 1.autopep8 / malaya / main.py View on Github external
def textcleaning(string):
    """Normalise a raw text snippet into a space-separated string of tokens.

    Steps, in order:
      1. drop whitespace-separated tokens containing ``#`` or ``@``;
      2. strip URL-like substrings, transliterate to ASCII, and put a space
         after every ``.`` and ``,``;
      3. keep only quotes, ASCII letters, hyphens and spaces;
      4. tokenize the lower-cased text and keep tokens accepted by ``isWord``;
      5. drop tokens containing any entry of ``list_laughing`` and tokens
         whose first half equals their second half;
      6. squeeze every run of a repeated character down to at most two;
      7. remove tokens found in ``STOPWORDS``.
    """
    # Step 1: discard hashtag / mention tokens before anything else.
    kept_tokens = [
        token for token in string.split()
        if '#' not in token and '@' not in token
    ]

    # Step 2: URL removal, ASCII transliteration, punctuation spacing.
    text = re.sub(r'http\S+|www.\S+', '', ' '.join(kept_tokens))
    text = unidecode(text).replace('.', '. ').replace(',', ', ')

    # Step 3: whitelist of permitted characters.
    text = re.sub(r'[^\'\"A-Za-z\- ]+', '', unidecode(text))

    # Step 4: tokenize and keep only what isWord accepts.
    words = []
    for raw in word_tokenize(text.lower()):
        candidate = raw.strip()
        if isWord(candidate):
            words.append(candidate)

    # Step 5: remove "laughing" tokens and exact self-repetitions ("haha").
    filtered = []
    for word in words:
        middle = len(word) // 2
        if all(laugh not in word for laugh in list_laughing) \
                and word[:middle] != word[middle:]:
            filtered.append(word)

    # Step 6: cap each run of identical characters at two.
    joined = ' '.join(filtered).lower()
    squeezed_runs = []
    for _, run in itertools.groupby(joined):
        squeezed_runs.append(''.join(run)[:2])
    squeezed = ''.join(squeezed_runs).split()

    # Step 7: stop-word removal.
    return ' '.join(word for word in squeezed if word not in STOPWORDS)
github Wordseer / wordseer / app / corenlp / corenlp.py View on Github external
def parse_parser_results(text):
    """ This is the nasty bit of code to interact with the command-line
    interface of the CoreNLP tools.  Takes a string of the parser results
    and then returns a Python list of dictionaries, one for each parsed
    sentence.
    """
    results = {"sentences": []}
    state = STATE_START
    lines = unidecode(text.decode('utf-8')).split("\n")
    for index, line in enumerate(lines):
        line = line.strip()

        if line.startswith("Sentence #"):
            sentence = {'words': [], 'parsetree': [], 'dependencies': []}
            results["sentences"].append(sentence)
            state = STATE_TEXT

        elif state == STATE_TEXT:
            sentence['text'] = remove_escapes(line)
            state = STATE_WORDS

        elif state == STATE_WORDS:
            if not line.startswith("[Text="):
                raise ParserError('Parse error. Could not find "[Text=" in: %s' % line)
            for s in WORD_PATTERN.findall(line):
github ekansa / open-context-py / opencontext_py / apps / exports / exptables / templating.py View on Github external
"""
        consolidated_ids = []
        consolidated_tuple_list = []
        look_person_list = raw_person_list
        last_count = self.exp_tab.row_count
        for raw_person in raw_person_list:
            # print('ID consolidated: ' + str(len(consolidated_ids)))
            if 'count' in raw_person:
                count = float(raw_person['count'])
            else:
                count = last_count
            last_count = count
            act_name = raw_person['label']
            if not isinstance(act_name, str):
                act_name = '[Not named]'
            act_uniname = unidecode(act_name)
            act_id = raw_person['id']
            for look_person in look_person_list:
                look_id = look_person['id']
                if not isinstance(look_person['label'], str):
                    look_person['label'] = '[Not named]'
                if look_id != act_id and \
                   look_id not in consolidated_ids and \
                   (act_name == look_person['label'] or act_uniname == unidecode(look_person['label'])):
                    # same name but different record for a person,
                    # lets consolidate it
                    consolidated_ids.append(look_person['id'])
                    count += float(look_person['count'])
            if act_id not in consolidated_ids:
                # print('Adding ' + str(unidecode(act_name)))
                person_tuple = (act_name, count)
                consolidated_tuple_list.append(person_tuple)
github spro / char-rnn.pytorch / helpers.py View on Github external
def read_file(filename):
    """Read a text file and return its ASCII transliteration with its length.

    Args:
        filename: Path of the text file to read (opened in text mode with
            the platform default encoding, as before).

    Returns:
        A ``(text, length)`` tuple where ``text`` is the file content passed
        through ``unidecode.unidecode`` and ``length`` is ``len(text)``.
    """
    # The context manager closes the handle deterministically; the previous
    # bare open(...).read() left that to the garbage collector.
    with open(filename) as handle:
        contents = unidecode.unidecode(handle.read())
    return contents, len(contents)
github maximilianh / pubMunch / lib / pubCrawlLib-broken.py View on Github external
def writeDocIdStatus(outDir, pmid, msg, longMsg, details=None):
    """ append a line to doc status file in outDir

    The line is tab-separated: pmid, msg, then longMsg and details when they
    are not None. Text fields are transliterated to ASCII with unidecode.
    The same row is also logged at INFO level, comma-separated.

    outDir  -- directory holding the status file (named by PMIDSTATNAME)
    pmid    -- document identifier, converted with str()
    msg     -- short status text
    longMsg -- longer description, or None to omit the column
    details -- optional extra column, omitted when None
    """
    fname = join(outDir, PMIDSTATNAME)

    row = [str(pmid), unidecode.unidecode(msg)]
    if longMsg is not None:
        row.append(unidecode.unidecode(longMsg))
    if details is not None:
        row.append(unidecode.unidecode(details))
    # No .encode('utf8') step: unidecode output is already plain ASCII, and
    # encoding to bytes makes the str joins below raise TypeError on Python 3.

    logging.info('Document status (pmid, logType, desc, details): %s', ','.join(row))

    # Mode 'a' creates the file when it does not exist, so no isfile() check
    # is needed; the with-block guarantees the handle is closed.
    with open(fname, 'a') as outFh:
        outFh.write('\t'.join(row))
        outFh.write('\n')
github project-alice-assistant / ProjectAliceSkills / Tools / JsonValidator / src / DialogValidation.py View on Github external
def searchMissingSlotValues(values: list, slot: dict) -> list:
	"""Return the entries of ``values`` that the slot does not define.

	Comparison is done on the unidecode-transliterated, lower-cased form of
	each string, against every slot value and every synonym.

	:param values: candidate value strings to check
	:param slot: slot definition; reads ``automaticallyExtensible`` and
		``values`` (each entry with ``value`` and optional ``synonyms``)
	:return: the unmatched entries of ``values`` in their original order;
		always empty when the slot is automatically extensible
	"""
	if slot['automaticallyExtensible']:
		return []

	# A set gives O(1) membership tests; the original list was scanned once
	# per candidate value.
	knownValues = set()
	for slotValue in slot['values']:
		knownValues.add(unidecode(slotValue['value']).lower())
		knownValues.update(unidecode(synonym).lower() for synonym in slotValue.get('synonyms', []))

	return [value for value in values if unidecode(value).lower() not in knownValues]
github vrypan / bucket3 / bucket3 / b3tools.py View on Github external
def slugify(str):
    """Build a lowercase ASCII slug from a (possibly URL-quoted) string.

    The input is URL-unquoted, transliterated to ASCII with unidecode and
    lower-cased; every run of non-word characters then becomes one hyphen.
    Underscores count as word characters and are kept, and hyphens at the
    start or end of the result are not trimmed.

    Credit: http://stackoverflow.com/a/8366771
    """
    unquoted = urllib.parse.unquote(str)
    ascii_lower = unidecode.unidecode(unquoted).lower()
    return re.sub(r'\W+', '-', ascii_lower)
github Pinafore / qb / bin / run_clm.py View on Github external
for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)
                                     ]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki, flags.wiki_location,
                                         qb, flags.question_db,
                                         source, flags.source_location,
                                         flags.max_pages,
                                         min_pages=min_answers):
            norm_title = lm.normalize_title(corpus, title)
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                print("Adding train doc %i, %s (%s)" %
                      (doc_num, unidecode(title), corpus))
                start = time.time()
            lm.add_train(norm_title, text)
            lm.add_train("compare_%i" % lm.compare(norm_title), text)

    print("Done training")
    if flags.lm_out:
        # Create the extractor object and write out the pickle
        o = open(flags.lm_out, 'w')
        lm.write_lm(o)
github voteview / WebVoteView / model / slugify.py View on Github external
def slugify(text):
	"""Convert text to a lowercase, hyphen-separated ASCII slug.

	The text is transliterated with unidecode, lower-cased and passed
	through linearName (defined elsewhere). Runs of characters outside
	a-z / 0-9 then become a single hyphen and one trailing hyphen is
	removed. A leading hyphen, if produced, is left in place.
	"""
	lowered = unidecode.unidecode(text).lower()
	linear = linearName(lowered)
	# Anything that is not a lowercase letter or digit turns into a hyphen.
	hyphenated = re.sub(r"[^a-z0-9]+", "-", linear).strip()
	collapsed = re.sub(r"[-]+", "-", hyphenated)
	return re.sub(r"[-]$", "", collapsed)
github r-anime / holo / src / data / database.py View on Github external
def _alphanum_convert(s):
	"""Reduce a title to a weak collation key for loose matching.

	Applies, in order: "&" -> "and", the _romanization_o substitution,
	"uu" -> "u", "wo" -> "o", removal of everything matched by
	_alphanum_regex, lower-casing, and unidecode transliteration.
	"""
	#TODO: punctuation is important for some shows to distinguish between seasons (ex. K-On! and K-On!!)
	# 6/28/16: The purpose of this function is weak collation; use of punctuation to distinguish between seasons can be done later when handling multiple found shows.

	# Characters to words
	collated = s.replace("&", "and")

	# Japanese romanization differences
	collated = _romanization_o.sub("o", collated)
	collated = collated.replace("uu", "u").replace("wo", "o")

	stripped = _alphanum_regex.sub("", collated)
	return unidecode(stripped.lower())

Unidecode

ASCII transliterations of Unicode text

GPL-2.0
Latest version published 10 months ago

Package Health Score

84 / 100
Full package analysis