How to use the commoncode.text.toascii function in commoncode

To help you get started, we’ve selected a few commoncode examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github nexB / scancode-toolkit / tests / commoncode / test_text.py View on Github external
def test_toascii_works_with_empty_unicode_or_bytes():
    """
    Check that text.toascii returns an empty unicode string for empty bytes
    and empty unicode input, both with and without transliteration.
    """
    # exercise every (input type, translit) combination exactly once
    assert u'' == text.toascii(b'', translit=False)
    assert u'' == text.toascii(b'', translit=True)
    assert u'' == text.toascii(u'', translit=False)
    assert u'' == text.toascii(u'', translit=True)
github nexB / scancode-toolkit / src / cluecode / copyrights.py View on Github external
line = remove_punctuation(u' ', line)

    # normalize spaces around commas
    line = line.replace(u' , ', u', ')

    # remove ASCII "line decorations"
    # such as in --- or === or !!! or *****
    line = remove_ascii_decorations(u' ', line)

    # in markup such as apache'>Copyright, pad ">" to "> " and "<" to " <"
    line = line.replace(u'>', u'> ').replace(u'<', u' <')

    # normalize to ascii text
    if to_ascii:
        line = toascii(line, translit=True)

    # normalize to use only LF as line endings so we can split correctly
    # and keep line endings
    line = unixlinesep(line)

    # strip verbatim back slash and comment signs again at both ends of a line
    # FIXME: this is done at the start of this function already
    line = line.strip(u'\\/*#%;')

    # normalize spaces
    line = u' '.join(line.split())

    return line
github nexB / scancode-toolkit / plugins / scancode-fingerprint / src / plugin_fingerprint / fingerprint.py View on Github external
def process_shingles(self, shingle, weighted_list):
    """
    Adjust ``weighted_list`` in place from the bits of the MD5 digest of
    ``shingle`` and return it: each position is incremented when the
    matching bit is set and decremented otherwise.
    """
    # convert other encodings to ascii. See #1690.
    ascii_shingle = toascii(shingle)
    digest = hashlib.md5(ascii_shingle.encode()).digest()

    for position, is_set in enumerate(self.bitarray_from_bytes(digest)):
        if is_set:
            weighted_list[position] += 1
            continue
        weighted_list[position] -= 1

    return weighted_list
github nexB / scancode-toolkit / src / textcode / strings.py View on Github external
def strings_from_file(location, buff_size=1024 * 1024, ascii=False, clean=True, min_len=MIN_LEN):
    """
    Yield the strings that strings_from_string() finds in the file at
    `location`, reading the file as bytes `buff_size` at a time to limit
    memory usage.

    When `ascii` is true, each string is passed through toascii() first.
    Every string is stripped, and only those whose stripped length is at
    least `min_len` are yielded.
    """
    with open(location, 'rb') as fileobj:
        chunk = fileobj.read(buff_size)
        # an empty read means the end of the file was reached
        while chunk:
            for found in strings_from_string(chunk, clean=clean, min_len=min_len):
                if ascii:
                    found = toascii(found)
                found = found.strip()
                if len(found) < min_len:
                    continue
                yield found
            chunk = fileobj.read(buff_size)
github nexB / scancode-toolkit / plugins / scancode-compiledcode / src / sourcecode / classify.py View on Github external
def generated_code(location):
    """
    Yield the text of lines that hint that the file at `location` is
    generated source code.

    Nothing is yielded when the detected content type is not text.
    Otherwise, at most `max_lines` leading lines are read; each one is
    stripped and converted with toascii(), and when its lowercased form
    contains any of `generated_keywords`, the first 100 characters of the
    converted line are yielded.
    """
    filetype = typecode.contenttype.get_type(location)
    if filetype.is_text:
        with open(location, 'rb') as source:
            for raw_line in islice(source, max_lines):
                ascii_line = commoncode.text.toascii(raw_line.strip())
                lowered = ascii_line.lower()
                for keyword in generated_keywords:
                    if keyword in lowered:
                        # yield only the first 100 chars, once per line
                        yield ascii_line[:100]
                        break
github nexB / scancode-toolkit / src / scancode / extract_cli.py View on Github external
def display_extract_summary():
    """
    Display a summary of warnings and errors if any.

    Walk `extract_results` and echo to stderr each error (in red) and each
    warning (in yellow) prefixed with its source path. Then echo a final
    "Extracting done." line colored red if any error was seen, yellow if
    only warnings were seen and green otherwise.
    """
    has_warnings = False
    has_errors = False
    for xev in extract_results:
        has_errors = has_errors or bool(xev.errors)
        has_warnings = has_warnings or bool(xev.warnings)
        source = fileutils.as_posixpath(xev.source)
        if not isinstance(source, compat.unicode):
            source = toascii(source, translit=True).decode('utf-8', 'replace')
            # NOTE(review): the relative path is only computed in this
            # non-unicode branch; unicode sources are reported with the
            # path returned by as_posixpath -- confirm this is intended.
            source = get_relative_path(path=source, len_base_path=len_base_path, base_is_dir=base_is_dir)
        # pass the formatted names explicitly rather than through locals()
        # so the messages do not depend on how local variables are named
        for e in xev.errors:
            echo_stderr('ERROR extracting: %(source)s: %(e)s' % {'source': source, 'e': e}, fg='red')
        for warn in xev.warnings:
            echo_stderr('WARNING extracting: %(source)s: %(warn)s' % {'source': source, 'warn': warn}, fg='yellow')

    # errors take precedence over warnings for the final color
    if has_errors:
        summary_color = 'red'
    elif has_warnings:
        summary_color = 'yellow'
    else:
        summary_color = 'green'

    echo_stderr('Extracting done.', fg=summary_color, reset=True)
github nexB / scancode-toolkit / src / scancode / extract_cli.py View on Github external
def extract_event(item):
    """
    Return the message to display for an extract event, or an empty string
    when nothing should be displayed (quiet mode, an empty item, or a done
    item in verbose mode).

    In verbose mode the message carries the relative path of the item
    source; otherwise it carries only its file name.
    """
    if quiet or not item:
        return ''

    path = item.source
    if not isinstance(path, compat.unicode):
        path = toascii(path, translit=True).decode('utf-8', 'replace')

    if verbose:
        if item.done:
            return ''
        shown = get_relative_path(path=path, len_base_path=len_base_path, base_is_dir=base_is_dir) if path else ''
    else:
        shown = fileutils.file_name(path) if path else ''

    # any falsy result collapses to an empty string
    line = shown or ''
    if not isinstance(line, compat.unicode):
        line = toascii(line, translit=True).decode('utf-8', 'replace')
    return 'Extracting: %(line)s' % {'line': line}
github nexB / scancode-toolkit / src / cluecode / finder.py View on Github external
Note: the location can be a list of lines for testing convenience.
    """
    if TRACE:
        from pprint import pformat
        loc = pformat(location)
        logger_debug('find(location=%(loc)r,\n  patterns=%(patterns)r)' % locals())

    for lineno, line in analysis.numbered_text_lines(location):
        for key, pattern in patterns:
            for match in pattern.findall(line):

                if TRACE:
                    logger_debug('find: yielding match: key=%(key)r, '
                          'match=%(match)r,\n    line=%(line)r' % locals())
                yield key, toascii(match), line, lineno