How to use the clevercsv.cparser_util.parse_string function in clevercsv

To help you get started, we’ve selected a few clevercsv examples that show popular ways the parse_string function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github alan-turing-institute / CleverCSV / clevercsv / break_ties.py View on Github external
if count > 0 and count % 2 == 0:
                return Descape
            else:
                return Dnone
    elif A.delimiter == B.delimiter:
        Aq, Ae = A.quotechar, A.escapechar
        Bq, Be = B.quotechar, B.escapechar
        if (Aq, Ae) == ("", "") or (Bq, Be) == ("", ""):
            # This case is activated if the escapechar+quotechar combination
            # occurs in the cells (e.g. "Jill\'s data") but no actual quoting
            # is done with the quote character.
            d_no = A if (Aq, Ae) == ("", "") else B
            d_yes = B if d_no == A else A

            X = list(parse_string(data, dialect=d_no))
            Y = list(parse_string(data, dialect=d_yes))

            if len(X) != len(Y):
                return None
            for x, y in zip(X, Y):
                if len(x) != len(y):
                    return None

            # if we're here, then there is no effect on structure.
            # we test if the only cells that differ are those that have an
            # escapechar+quotechar combination.
            eq = d_yes.escapechar + d_yes.quotechar
            for rX, rY in zip(X, Y):
                for x, y in zip(rX, rY):
                    if x != y:
                        if not eq in x:
                            return None
github alan-turing-institute / CleverCSV / clevercsv / break_ties.py View on Github external
# Artifact due to type detection (comma as radix point)
            if A.delimiter == ",":
                return A
            else:
                return B
        elif A.delimiter == "-" or B.delimiter == "-":
            # Artifact due to type detection (dash as minus sign)
            if A.delimiter == "-":
                return B
            else:
                return A
    elif diff_only_in_key("escapechar"):
        Dnone, Descape = (A, B) if A.escapechar == "" else (B, A)

        X = list(parse_string(data, Dnone))
        Y = list(parse_string(data, Descape))

        # double check shape. Usually if the shape differs the pattern score
        # should have caught it, but if by a freakish occurrence it hasn't then
        # we can't break this tie (for now)
        if len(X) != len(Y):
            return None
        for x, y in zip(X, Y):
            if len(x) != len(y):
                return None

        cells_escaped = []
        cells_unescaped = []
        for x, y in zip(X, Y):
            for u, v in zip(x, y):
                if u != v:
                    cells_unescaped.append(u)
github alan-turing-institute / CleverCSV / clevercsv / detect_type.py View on Github external
Parameters
    ----------
    data: str
        the data as a single string

    dialect: SimpleDialect
        the dialect to use

    eps: float
        the minimum value of the type score

    """
    total = 0
    known = 0
    td = TypeDetector()
    for row in parse_string(data, dialect, return_quoted=True):
        for cell, is_quoted in row:
            total += 1
            known += td.is_known_type(cell, is_quoted=is_quoted)
    if total == 0:
        return eps
    return max(eps, known / total)
github alan-turing-institute / CleverCSV / clevercsv / break_ties.py View on Github external
Returns
    -------
    dialects: list
        List of SimpleDialect objects.

    """
    equal_delim = len(set([d.delimiter for d in dialects])) == 1
    if not equal_delim:
        return None

    # First, identify dialects that result in the same parsing result.
    equal_dialects = []
    for a, b in pairwise(dialects):
        X = list(parse_string(data, a))
        Y = list(parse_string(data, b))
        if X == Y:
            equal_dialects.append((a, b))

    # Try to break the ties in these pairs
    new_dialects = set()
    visited = set()
    for A, B in equal_dialects:
        ans = break_ties_two(data, A, B)
        if not ans is None:
            new_dialects.add(ans)
        visited.add(A)
        visited.add(B)

    # and add the dialects that we didn't visit
    for d in dialects:
        if not d in visited:
github alan-turing-institute / CleverCSV / clevercsv / break_ties.py View on Github external
if sorted([A.delimiter, B.delimiter]) == sorted([",", " "]):
            # Artifact due to type detection (comma as radix point)
            if A.delimiter == ",":
                return A
            else:
                return B
        elif A.delimiter == "-" or B.delimiter == "-":
            # Artifact due to type detection (dash as minus sign)
            if A.delimiter == "-":
                return B
            else:
                return A
    elif diff_only_in_key("escapechar"):
        Dnone, Descape = (A, B) if A.escapechar == "" else (B, A)

        X = list(parse_string(data, Dnone))
        Y = list(parse_string(data, Descape))

        # double check shape. Usually if the shape differs the pattern score
        # should have caught it, but if by a freakish occurrence it hasn't then
        # we can't break this tie (for now)
        if len(X) != len(Y):
            return None
        for x, y in zip(X, Y):
            if len(x) != len(y):
                return None

        cells_escaped = []
        cells_unescaped = []
        for x, y in zip(X, Y):
            for u, v in zip(x, y):
                if u != v:
github alan-turing-institute / CleverCSV / clevercsv / break_ties.py View on Github external
Notes
    -----
    We have only observed one tie for each case during development, so
    this may need to be improved in the future.

    """

    equal_delim = A.delimiter == B.delimiter == C.delimiter
    equal_escape = A.escapechar == B.escapechar == C.escapechar

    if equal_delim and equal_escape:
        # difference is *only* in quotechar
        dialects = [A, B, C]

        pA = list(parse_string(data, A))
        pB = list(parse_string(data, B))
        pC = list(parse_string(data, C))

        if len(pA) != len(pB) or len(pA) != len(pC) or len(pB) != len(pC):
            return None

        p_none, d_none = next(
            (
                (p, d)
                for p, d in zip([pA, pB, pC], dialects)
                if d.quotechar == ""
            ),
            (None, None),
        )
        if p_none is None:
            return None
github alan-turing-institute / CleverCSV / clevercsv / break_ties.py View on Github external
List of SimpleDialect objects

    Returns
    -------
    dialects: list
        List of SimpleDialect objects.

    """
    equal_delim = len(set([d.delimiter for d in dialects])) == 1
    if not equal_delim:
        return None

    # First, identify dialects that result in the same parsing result.
    equal_dialects = []
    for a, b in pairwise(dialects):
        X = list(parse_string(data, a))
        Y = list(parse_string(data, b))
        if X == Y:
            equal_dialects.append((a, b))

    # Try to break the ties in these pairs
    new_dialects = set()
    visited = set()
    for A, B in equal_dialects:
        ans = break_ties_two(data, A, B)
        if not ans is None:
            new_dialects.add(ans)
        visited.add(A)
        visited.add(B)

    # and add the dialects that we didn't visit
    for d in dialects:
github alan-turing-institute / CleverCSV / clevercsv / break_ties.py View on Github external
Notes
    -----
    We have only observed one tie for each case during development, so
    this may need to be improved in the future.

    """

    equal_delim = A.delimiter == B.delimiter == C.delimiter
    equal_escape = A.escapechar == B.escapechar == C.escapechar

    if equal_delim and equal_escape:
        # difference is *only* in quotechar
        dialects = [A, B, C]

        pA = list(parse_string(data, A))
        pB = list(parse_string(data, B))
        pC = list(parse_string(data, C))

        if len(pA) != len(pB) or len(pA) != len(pC) or len(pB) != len(pC):
            return None

        p_none, d_none = next(
            (
                (p, d)
                for p, d in zip([pA, pB, pC], dialects)
                if d.quotechar == ""
            ),
            (None, None),
        )
        if p_none is None:
            return None
github alan-turing-institute / CleverCSV / clevercsv / break_ties.py View on Github external
-----
    We have only observed one tie for each case during development, so
    this may need to be improved in the future.

    """

    equal_delim = A.delimiter == B.delimiter == C.delimiter
    equal_escape = A.escapechar == B.escapechar == C.escapechar

    if equal_delim and equal_escape:
        # difference is *only* in quotechar
        dialects = [A, B, C]

        pA = list(parse_string(data, A))
        pB = list(parse_string(data, B))
        pC = list(parse_string(data, C))

        if len(pA) != len(pB) or len(pA) != len(pC) or len(pB) != len(pC):
            return None

        p_none, d_none = next(
            (
                (p, d)
                for p, d in zip([pA, pB, pC], dialects)
                if d.quotechar == ""
            ),
            (None, None),
        )
        if p_none is None:
            return None

        rem = [