# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _get_filters(self, author, date_range, min_len):
filters = []
if min_len is not None:
if min_len < 1:
raise ValueError("`min_len` must be at least 1")
filters.append(
lambda record: len(record.get("text", "")) >= min_len
)
if author is not None:
author = utils.validate_set_members(
author, (str, bytes), valid_vals=self.authors)
filters.append(
lambda record: record.get("author") and any(athr in author for athr in record["author"])
)
if date_range is not None:
date_range = utils.validate_and_clip_range(
date_range, self.full_date_range, val_type=(str, bytes))
filters.append(
lambda record: record.get("year") and date_range[0] <= record["year"] < date_range[1]
)
return filters
def _get_filters(self, subreddit, date_range, score_range, min_len):
filters = []
if min_len is not None:
if min_len < 1:
raise ValueError("`min_len` must be at least 1")
filters.append(
lambda record: len(record.get("body", "")) >= min_len
)
if subreddit is not None:
subreddit = utils.validate_set_members(subreddit, (str, bytes))
filters.append(
lambda record: record.get("subreddit") in subreddit
)
if date_range is not None:
date_range = utils.validate_and_clip_range(
date_range, self.full_date_range, val_type=(str, bytes))
filters.append(
lambda record: (
record.get("created_utc")
and date_range[0] <= record["created_utc"] < date_range[1]
)
)
if score_range is not None:
score_range = utils.validate_and_clip_range(
score_range, self._full_score_range, val_type=(int, float))
filters.append(
lambda record: (
record.get("score")
and score_range[0] <= record["score"] < score_range[1]
)
)
# NOTE(review): orphaned fragment — these lines continue a
# `def _get_filters(self, ...)` signature whose opening line is missing
# from this chunk; do not assume the parameter list is complete.
speaker_name,
speaker_party,
chamber,
congress,
date_range,
min_len,
):
# Collect one predicate per constraint the caller actually supplied.
filters = []
if min_len is not None:
if min_len < 1:
raise ValueError("`min_len` must be at least 1")
# Keep only records whose "text" field is at least `min_len` chars long.
filters.append(
lambda record: len(record.get("text", "")) >= min_len
)
if date_range is not None:
# Clip the requested [start, end) interval to the dataset's full range
# before filtering records by their "date" value.
date_range = utils.validate_and_clip_range(
date_range, self.full_date_range, val_type=(str, bytes))
filters.append(
lambda record: (
record.get("date")
and date_range[0] <= record["date"] < date_range[1]
)
)
if speaker_name is not None:
# Validate requested names against the dataset's known speaker names.
speaker_name = utils.validate_set_members(
speaker_name, (str, bytes), valid_vals=self.speaker_names)
filters.append(lambda record: record.get("speaker_name") in speaker_name)
if speaker_party is not None:
# Validate requested parties against the dataset's known parties.
speaker_party = utils.validate_set_members(
speaker_party, (str, bytes), valid_vals=self.speaker_parties)
filters.append(lambda record: record.get("speaker_party") in speaker_party)
# NOTE(review): truncated here — the body of this `chamber` branch (and
# presumably `congress` handling and the final return) is missing.
if chamber is not None:
# NOTE(review): orphaned fragment — these lines continue a
# `def _get_filters(` signature whose opening line is missing from this
# chunk.
self,
opinion_author,
decision_direction,
issue_area,
date_range,
min_len,
):
# Collect one predicate per constraint the caller actually supplied.
filters = []
if min_len is not None:
if min_len < 1:
raise ValueError("`min_len` must be at least 1")
# Keep only records whose "text" field is at least `min_len` chars long.
filters.append(
lambda record: len(record.get("text", "")) >= min_len
)
if date_range is not None:
# Clip the requested [start, end) interval to the dataset's full range
# before filtering records by their "decision_date" value.
date_range = utils.validate_and_clip_range(
date_range, self.full_date_range, val_type=(str, bytes))
filters.append(
lambda record: (
record.get("decision_date")
and date_range[0] <= record["decision_date"] < date_range[1]
)
)
if opinion_author is not None:
# Author is an integer code here, validated against known codes.
opinion_author = utils.validate_set_members(
opinion_author, int, valid_vals=self.opinion_author_codes)
filters.append(
lambda record: record.get("maj_opinion_author") in opinion_author)
if decision_direction is not None:
decision_direction = utils.validate_set_members(
decision_direction, (str, bytes), valid_vals=self.decision_directions)
# NOTE(review): truncated here — the lambda for this append (and any
# `issue_area` handling plus the final return) is missing.
filters.append(
def _get_filters(self, rating_range, min_len):
filters = []
if min_len is not None:
if min_len < 1:
raise ValueError("`min_len` must be at least 1")
filters.append(
lambda record: len(record.get("text", "")) >= min_len
)
if rating_range is not None:
rating_range = utils.validate_and_clip_range(
rating_range, self.full_rating_range, val_type=int)
filters.append(
lambda record: (
record.get("rating")
and rating_range[0] <= record["rating"] < rating_range[1]
)
)
return filters
def download(self, *, date_range=(None, None), force=False):
    """
    Download 1 or more monthly Reddit comments files from archive.org
    and save them to disk under the ``data_dir`` directory.

    Args:
        date_range (Tuple[str]): Interval specifying the [start, end) dates
            for which comments files will be downloaded. Each item must be
            a str formatted as YYYY-MM or YYYY-MM-DD (the latter is converted
            to the corresponding YYYY-MM value). Both start and end values
            must be specified, but a null value for either is automatically
            replaced by the minimum or maximum valid values, respectively.
        force (bool): If True, download the dataset, even if it already
            exists on disk under ``data_dir``.
    """
    clipped_range = utils.validate_and_clip_range(
        date_range, self.full_date_range, val_type=(str, bytes))
    # One remote file per month in the (clipped) interval.
    for stub in self._generate_filestubs(clipped_range):
        tio.download_file(
            urllib.parse.urljoin(DOWNLOAD_ROOT, stub),
            filename=stub,
            dirpath=self.data_dir,
            force=force,
        )
# NOTE(review): orphaned fragment — this is the tail of a Reddit-style
# `_get_filters` body; its `def` line and `filters = []` initialization
# are missing from this chunk.
if subreddit is not None:
# Validate the requested subreddit name(s), then keep matching records.
subreddit = utils.validate_set_members(subreddit, (str, bytes))
filters.append(
lambda record: record.get("subreddit") in subreddit
)
if date_range is not None:
# Clip the requested [start, end) interval to the dataset's full range
# before filtering records by their "created_utc" value.
date_range = utils.validate_and_clip_range(
date_range, self.full_date_range, val_type=(str, bytes))
filters.append(
lambda record: (
record.get("created_utc")
and date_range[0] <= record["created_utc"] < date_range[1]
)
)
if score_range is not None:
# Clip the requested [start, end) interval to the dataset's full
# score range before filtering records by their "score" value.
score_range = utils.validate_and_clip_range(
score_range, self._full_score_range, val_type=(int, float))
filters.append(
lambda record: (
record.get("score")
and score_range[0] <= record["score"] < score_range[1]
)
)
return filters