How to use the t5.data.utils.TfdsTask function in t5

To help you get started, we’ve selected a few t5 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github google-research / text-to-text-transfer-transformer / t5 / data / test_utils.py View on Github external
def add_tfds_task(
    name,
    tfds_name="fake:0.0.0",
    text_preprocessor=test_text_preprocessor,
    token_preprocessor=None,
    splits=None):
  TaskRegistry.add(
      name,
      dataset_utils.TfdsTask,
      tfds_name=tfds_name,
      text_preprocessor=text_preprocessor,
      token_preprocessor=token_preprocessor,
      sentencepiece_model_path=os.path.join(TEST_DATA_DIR, "sentencepiece",
                                            "sentencepiece.model"),
      metric_fns=[],
      splits=splits)
github google-research / text-to-text-transfer-transformer / t5 / data / tasks.py View on Github external
from t5.data.utils import set_global_cache_dirs
from t5.data.utils import TaskRegistry
from t5.data.utils import TfdsTask
from t5.evaluation import metrics
import tensorflow_datasets as tfds




# ==================================== C4 ======================================
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
for config_suffix in _c4_config_suffixes:
  TaskRegistry.add(
      "c4{name}_v020_unsupervised".format(
          name=config_suffix.replace(".", "_")),
      TfdsTask,
      tfds_name="c4/en{config}:1.0.0".format(config=config_suffix),
      text_preprocessor=functools.partial(
          preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
      token_preprocessor=preprocessors.unsupervised,
      sentencepiece_model_path=DEFAULT_SPM_PATH,
      metric_fns=[])

# ================================ Wikipedia ===================================
TaskRegistry.add(
    "wikipedia_20190301.en_v003_unsupervised",
    TfdsTask,
    # 0.0.4 is identical to 0.0.3 except empty records removed.
    tfds_name="wikipedia/20190301.en:0.0.4",
    text_preprocessor=functools.partial(
        preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
    token_preprocessor=preprocessors.unsupervised,
github google-research / text-to-text-transfer-transformer / t5 / data / utils.py View on Github external
to use.
      splits: list(string) or None, a list of allowable splits to load. The
        default, None, uses all available splits from the TFDS dataset info.
      **task_kwargs: dict, additional keyword arguments for the parent `Task`
        class.
    """
    if ":" not in tfds_name:
      raise ValueError(
          "TFDS name must contain a version number, got: %s" % tfds_name)

    self._tfds_dataset = LazyTfdsLoader(tfds_name, tfds_data_dir)

    def dataset_fn(split, shuffle_files):
      return self._tfds_dataset.load(split, shuffle_files)

    super(TfdsTask, self).__init__(
        name,
        dataset_fn=dataset_fn,
        splits=splits,
        text_preprocessor=text_preprocessor,
        sentencepiece_model_path=sentencepiece_model_path,
        metric_fns=metric_fns,
        **task_kwargs)
github google-research / text-to-text-transfer-transformer / t5 / data / tasks.py View on Github external
for config_suffix in _c4_config_suffixes:
  TaskRegistry.add(
      "c4{name}_v020_unsupervised".format(
          name=config_suffix.replace(".", "_")),
      TfdsTask,
      tfds_name="c4/en{config}:1.0.0".format(config=config_suffix),
      text_preprocessor=functools.partial(
          preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
      token_preprocessor=preprocessors.unsupervised,
      sentencepiece_model_path=DEFAULT_SPM_PATH,
      metric_fns=[])

# ================================ Wikipedia ===================================
TaskRegistry.add(
    "wikipedia_20190301.en_v003_unsupervised",
    TfdsTask,
    # 0.0.4 is identical to 0.0.3 except empty records removed.
    tfds_name="wikipedia/20190301.en:0.0.4",
    text_preprocessor=functools.partial(
        preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
    token_preprocessor=preprocessors.unsupervised,
    sentencepiece_model_path=DEFAULT_SPM_PATH,
    metric_fns=[])


# =================================== GLUE =====================================
def _get_glue_text_preprocessor(builder_config):
  """Return the glue preprocessor.

  Args:
    builder_config: a BuilderConfig
  Returns: