How to use the textacy.io.utils.open_sesame function in textacy

To help you get started, we’ve selected a few textacy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github chartbeat-labs / textacy / textacy / io / json.py View on Github external
or streaming item-by-item.

    Args:
        filepath (str): Path to file on disk from which data will be read.
        mode (str): Mode with which ``filepath`` is opened.
        encoding (str): Name of the encoding used to decode or encode the data
            in ``filepath``. Only applicable in text mode.
        lines (bool): If False, all data is read in at once; otherwise, data is
            read in one line at a time.

    Yields:
        object: Next JSON item; could be a dict, list, int, float, str,
        depending on the value of ``lines``.
    """
    _validate_read_mode(mode)
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        if lines is False:
            yield json.load(f)
        else:
            for line in f:
                yield json.loads(line)
github chartbeat-labs / textacy / textacy / io / write.py View on Github external
def write_file(content, filepath, mode='wt', encoding=None,
               make_dirs=False):
    """
    Write ``content`` to disk at ``filepath``, optionally creating any missing
    intermediate directories. Files whose extensions indicate gzip or bz2
    compression are compressed automatically.

    See Also:
        :func:`open_sesame()`
    """
    # open_sesame handles compression, encoding, and directory creation
    fileobj = open_sesame(
        filepath, mode=mode, encoding=encoding, make_dirs=make_dirs)
    with fileobj as f:
        f.write(content)
github chartbeat-labs / textacy / textacy / io / read.py View on Github external
Args:
        filepath (str): /path/to/file on disk from which json objects will be streamed,
            where all json objects are mashed together, end-to-end, on a single line;
            for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}{"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str, optional)
        encoding (str, optional)
        buffersize (int, optional): number of bytes to read in as a chunk

    Yields:
        dict: Next valid JSON object, converted to native Python equivalent.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        buffer = ''
        for chunk in iter(functools.partial(f.read, buffersize), ''):
            buffer += chunk
            while buffer:
                try:
                    result, index = JSON_DECODER.raw_decode(buffer)
                    yield result
                    buffer = buffer[index:]
                # not enough data to decode => read another chunk
                except ValueError:
                    break
github chartbeat-labs / textacy / textacy / io / csv.py View on Github external
Yields:
        List[obj]: Next row, whose elements are strings and/or floats.
        If ``fieldnames`` is None or 'infer' doesn't detect a header row.

        *or*

        Dict[str, obj]: Next row, as an ordered dictionary of (key, value) pairs,
        where keys are column names and values are the corresponding strings
        and/or floats. If ``fieldnames`` is a list of column names or 'infer'
        detects a header row.

    See Also:
        https://docs.python.org/3/library/csv.html#csv.reader
    """
    has_header = False
    with open_sesame(filepath, mode="rt", encoding=encoding, newline="") as f:
        if dialect == "infer" or fieldnames == "infer":
            sniffer = compat.csv.Sniffer()
            # add pipes to the list of preferred delimiters, and put spaces last
            sniffer.preferred = [",", "\t", "|", ";", ":", " "]
            # sample = "".join(f.readline() for _ in range(5))  # f.read(1024)
            sample = f.read(1024)
            if dialect == "infer":
                dialect = sniffer.sniff(sample)
            if fieldnames == "infer":
                has_header = sniffer.has_header(sample)
            f.seek(0)
        if has_header is True:
            csv_reader = compat.csv.DictReader(
                f,
                fieldnames=None,
                dialect=dialect,
github chartbeat-labs / textacy / textacy / io / read.py View on Github external
def read_spacy_docs(filepath):
    """
    Stream pickled ``spacy.Doc`` s from disk, one at a time.

    Args:
        filepath (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        Next deserialized ``spacy.Doc``.
    """
    # binary mode is required for pickle; open_sesame handles decompression
    with open_sesame(filepath, mode='rb') as pkl_file:
        docs = compat.pickle.load(pkl_file)
        for doc in docs:
            yield doc
github chartbeat-labs / textacy / textacy / io / read.py View on Github external
]

        mode (str, optional)
        encoding (str, optional)
        prefix (str, optional): if '', the entire JSON object will be read in at once;
            if 'item', each item in a top-level array will be read in successively;
            if 'item.text', each array item's 'text' value will be read in successively

    Yields:
        Next matching JSON object; could be a dict, list, int, float, str,
        depending on the value of ``prefix``.

    Note:
        Refer to ``ijson`` at https://pypi.python.org/pypi/ijson/ for usage details.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        if prefix == '':
            yield json.load(f)
        else:
            for item in ijson.items(f, prefix):
                yield item
github chartbeat-labs / textacy / textacy / io / read.py View on Github external
Args:
        filepath (str): /path/to/file on disk from which rows will be streamed
        encoding (str)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing; if 'infer', the
            first 1024 bytes of the file is analyzed, producing a best guess for
            the correct dialect
        delimiter (str): 1-character string used to separate fields in a row

    Yields:
        List[obj]: Next row, whose elements are strings and/or numbers.

    .. seealso:: https://docs.python.org/3/library/csv.html#csv.reader
    """
    with open_sesame(filepath, mode='rt', encoding=encoding, newline='') as f:
        if dialect == 'infer':
            sniffer = compat.csv.Sniffer()
            # add pipes to the list of preferred delimiters, and put spaces last
            sniffer.preferred = [',', '\t', '|', ';', ':', ' ']
            dialect = sniffer.sniff(f.read(1024))
            f.seek(0)
        for row in compat.csv.reader(f, dialect=dialect, delimiter=delimiter):
            yield row
github chartbeat-labs / textacy / textacy / io / write.py View on Github external
encoding (str)
        make_dirs (bool)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing
        delimiter (str): 1-character string used to separate fields in a row

    See Also:
        https://docs.python.org/3/library/csv.html#csv.writer

    Note:
        Here, CSV is used as a catch-all term for *any* delimited file
        format, and ``delimiter=','`` is merely the function's default value.
        Other common delimited formats are TSV (tab-separated-value, with
        ``delimiter='\\t'``) and PSV (pipe-separated-value, with ``delimiter='|'``).
    """
    with open_sesame(filepath, mode='wt', encoding=encoding, newline='') as f:
        csv_writer = compat.csv.writer(f, dialect=dialect, delimiter=delimiter)
        csv_writer.writerows(rows)
github chartbeat-labs / textacy / textacy / io / text.py View on Github external
Args:
        filepath (str): Path to file on disk from which data will be read.
        mode (str): Mode with which ``filepath`` is opened.
        encoding (str): Name of the encoding used to decode or encode the data
            in ``filepath``. Only applicable in text mode.
        lines (bool): If False, all data is read in at once; otherwise, data is
            read in one line at a time.

    Yields:
        str: Next line of text to read in.

        If ``lines`` is False, wrap this output in :func:`next()` to conveniently
        access the full text.
    """
    _validate_read_mode(mode)
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        if lines is False:
            yield f.read()
        else:
            for line in f:
                yield line
github chartbeat-labs / textacy / textacy / io / read.py View on Github external
def read_file(filepath, mode='rt', encoding=None):
    """
    Return the entire contents of the file at ``filepath`` as a single value.
    gzip-, bz2-, and lzma-compressed files are decompressed automatically.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as fileobj:
        contents = fileobj.read()
    return contents