Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
record.get("decision_date")
and date_range[0] <= record["decision_date"] < date_range[1]
)
)
if opinion_author is not None:
opinion_author = utils.validate_set_members(
opinion_author, int, valid_vals=self.opinion_author_codes)
filters.append(
lambda record: record.get("maj_opinion_author") in opinion_author)
if decision_direction is not None:
decision_direction = utils.validate_set_members(
decision_direction, (str, bytes), valid_vals=self.decision_directions)
filters.append(
lambda record: record.get("decision_direction") in decision_direction)
if issue_area is not None:
issue_area = utils.validate_set_members(
issue_area, int, valid_vals=self.issue_area_codes)
filters.append(
lambda record: record.get("issue_area") in issue_area)
return filters
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
from . import utils
LOGGER = logging.getLogger(__name__)
# Emit a one-time deprecation warning when this legacy module is imported.
# NOTE: the two adjacent string literals previously concatenated without a
# separating space ("...v0.7.0.Use the..."); a trailing space fixes the message.
utils.deprecated(
    "The `spacy_pipelines` module is deprecated and will be removed in v0.7.0. "
    "Use the `textacy.spacier` subpackage instead.",
    action="once",
)
def _merge_entities(doc):
"""
Merge named entities *in-place* within parent ``doc`` so that each becomes
a single token.
Args:
doc (``spacy.doc``)
"""
for ent in doc.ents:
try:
a "uniformly sampled" collection of ~120k tweets over all languages and
a "recall oriented" collection of ~1.5k tweets per language --
then fetch available tweets' data from the Twitter API.
Args:
dirpath (str or :class:`pathlib.Path`)
creds_fpath (str or :class:`pathlib.Path`)
force (bool)
References:
https://blog.twitter.com/engineering/en_us/a/2015/evaluating-language-identification-performance.html
TODO: Ideally, use a tweet search endpoint and filter by language,
then just iterate over all ISO-639-1 language codes.
"""
dirpath = textacy.utils.to_path(dirpath).resolve()
url_fnames = [
(
"https://raw.githubusercontent.com/mitjat/langid_eval/master/uniformly_sampled.tsv",
"uniformly_sampled.tsv",
),
(
"https://raw.githubusercontent.com/mitjat/langid_eval/master/recall_oriented.tsv",
"recall_oriented.tsv",
)
]
# download tweet ids first
for url, fname in url_fnames:
textacy.io.download_file(url, filename=fname, dirpath=dirpath, force=force)
# download full tweets data next
tweets_fpath = dirpath.joinpath("tweets.jsonl")
if tweets_fpath.is_file() and force is False:
from __future__ import absolute_import, division, print_function, unicode_literals
import itertools
import logging
from spacy.symbols import NOUN, PROPN, VERB
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.span import Span as SpacySpan
from . import constants
from . import text_utils
from . import utils
LOGGER = logging.getLogger(__name__)
# Emit a one-time deprecation warning when this legacy module is imported.
# NOTE: the two adjacent string literals previously concatenated without a
# separating space ("...v0.7.0.Use the..."); a trailing space fixes the message.
utils.deprecated(
    "The `spacy_utils` module is deprecated and will be removed in v0.7.0. "
    "Use the `textacy.spacier` subpackage instead.",
    action="once",
)
def is_plural_noun(token):
"""
Returns True if token is a plural noun, False otherwise.
Args:
token (``spacy.Token``): parent document must have POS information
Returns:
bool
"""