import os

# Test helper: registers a `TfdsTask` backed by the fake "fake:0.0.0" TFDS
# dataset. `TEST_DATA_DIR`, `test_text_preprocessor`, `dataset_utils`, and
# `TaskRegistry` are defined elsewhere in the test module.
def add_tfds_task(
    name,
    tfds_name="fake:0.0.0",
    text_preprocessor=test_text_preprocessor,
    token_preprocessor=None,
    splits=None):
  TaskRegistry.add(
      name,
      dataset_utils.TfdsTask,
      tfds_name=tfds_name,
      text_preprocessor=text_preprocessor,
      token_preprocessor=token_preprocessor,
      sentencepiece_model_path=os.path.join(TEST_DATA_DIR, "sentencepiece",
                                            "sentencepiece.model"),
      metric_fns=[],
      splits=splits)
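
# Hedged usage sketch: register a task against the fake dataset and pull it
# back out of the registry. `TaskRegistry.get` is the standard lookup; the
# task name here is invented for illustration.
add_tfds_task("my_fake_task", splits=["train", "validation"])
task = TaskRegistry.get("my_fake_task")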
import functools

from t5.data import preprocessors
from t5.data.utils import set_global_cache_dirs
from t5.data.utils import TaskRegistry
from t5.data.utils import TfdsTask
from t5.evaluation import metrics
import tensorflow_datasets as tfds

# Path to the default SentencePiece vocabulary used by the tasks below.
DEFAULT_SPM_PATH = "gs://t5-data/vocabs/cc_all.32000/sentencepiece.model"
# ==================================== C4 ======================================
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
for config_suffix in _c4_config_suffixes:
  TaskRegistry.add(
      "c4{name}_v020_unsupervised".format(
          name=config_suffix.replace(".", "_")),
      TfdsTask,
      tfds_name="c4/en{config}:1.0.0".format(config=config_suffix),
      text_preprocessor=functools.partial(
          preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
      token_preprocessor=preprocessors.unsupervised,
      sentencepiece_model_path=DEFAULT_SPM_PATH,
      metric_fns=[])
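
# Sanity-check sketch: the loop above registers one task per C4 config, and
# `rekey` with the key_map above turns a raw example like {"text": "..."}
# into {"inputs": "", "targets": "..."} (a None source key yields an empty
# string), which the unsupervised token preprocessor then corrupts.
for suffix in _c4_config_suffixes:
  print("c4{}_v020_unsupervised".format(suffix.replace(".", "_")))
# -> c4_v020_unsupervised, c4_noclean_v020_unsupervised,
#    c4_realnewslike_v020_unsupervised, c4_webtextlike_v020_unsupervised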
# ================================ Wikipedia ===================================
TaskRegistry.add(
    "wikipedia_20190301.en_v003_unsupervised",
    TfdsTask,
    # 0.0.4 is identical to 0.0.3 except empty records removed.
    tfds_name="wikipedia/20190301.en:0.0.4",
    text_preprocessor=functools.partial(
        preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
    token_preprocessor=preprocessors.unsupervised,
    sentencepiece_model_path=DEFAULT_SPM_PATH,
    metric_fns=[])
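
# Sketch of materializing a registered task as a tokenized dataset.
# `get_dataset` and its `sequence_length` argument follow the t5 `Task` API;
# the lengths here are arbitrary.
task = TaskRegistry.get("wikipedia_20190301.en_v003_unsupervised")
ds = task.get_dataset(
    sequence_length={"inputs": 512, "targets": 512}, split="train")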
# ========================= TfdsTask (t5.data.utils) ===========================
class TfdsTask(Task):
  """A `Task` whose input data comes from a TensorFlow Datasets dataset."""

  def __init__(self, name, tfds_name, text_preprocessor,
               sentencepiece_model_path, metric_fns, tfds_data_dir=None,
               splits=None, **task_kwargs):
    """TfdsTask constructor.

    Args:
      name: string, a unique name for the Task.
      tfds_name: string, the name and version number of a TFDS dataset,
        optionally with a config.
      text_preprocessor: a function (or list of functions) that maps a
        tf.data.Dataset of string features to another such dataset. May be
        None as a no-op.
      sentencepiece_model_path: string, path to a SentencePiece model file.
      metric_fns: list(callable), metric functions to run during evaluation.
      tfds_data_dir: string, an optional path to a specific TFDS data directory
        to use.
      splits: list(string) or None, a list of allowable splits to load. The
        default, None, uses all available splits from the TFDS dataset info.
      **task_kwargs: dict, additional keyword arguments for the parent `Task`
        class.
    """
if ":" not in tfds_name:
raise ValueError(
"TFDS name must contain a version number, got: %s" % tfds_name)
self._tfds_dataset = LazyTfdsLoader(tfds_name, tfds_data_dir)
def dataset_fn(split, shuffle_files):
return self._tfds_dataset.load(split, shuffle_files)
super(TfdsTask, self).__init__(
name,
dataset_fn=dataset_fn,
splits=splits,
text_preprocessor=text_preprocessor,
sentencepiece_model_path=sentencepiece_model_path,
metric_fns=metric_fns,
**task_kwargs)
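
# Minimal sketch of constructing a TfdsTask directly (the registry calls
# above do this for you). The version suffix is mandatory: passing
# "wikipedia/20190301.en" without ":0.0.4" would trip the ValueError in
# __init__ above.
task = TfdsTask(
    "wiki_unsupervised",
    tfds_name="wikipedia/20190301.en:0.0.4",
    text_preprocessor=None,
    sentencepiece_model_path=DEFAULT_SPM_PATH,
    metric_fns=[])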
# =================================== GLUE =====================================
def _get_glue_text_preprocessor(builder_config):
  """Return the glue preprocessor.

  Args:
    builder_config: a BuilderConfig
  Returns: