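# The fragments below come from several dataset build scripts and omit their
# imports. A minimal sketch of what they rely on (assuming ParlAI's
# parlai.core.build_data helpers; create_fb_format and utils.RuCoref2CoNLL are
# defined in the respective source modules and are not imported here):
import os
import time
import urllib.parse
from os.path import join

import pandas as pd

import parlai.core.build_data as build_data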
raw_path = os.path.abspath(opt['raw_dataset_path'] or ".")
train_file = os.path.join(raw_path, 'train.csv')
valid_file = os.path.join(raw_path, 'test_with_solutions.csv')
test_file = os.path.join(raw_path, 'impermium_verification_labels.csv')
if not os.path.isfile(train_file) or not os.path.isfile(valid_file) or not os.path.isfile(test_file):
    ds_path = os.environ.get('DATASETS_URL')
    file_name = 'insults.tar.gz'
    if not ds_path:
        raise RuntimeError('Please download the dataset files from'
                           ' https://www.kaggle.com/c/detecting-insults-in-social-commentary/data'
                           ' and set the path to their directory in the raw-dataset-path parameter')
    print('Trying to download an insults dataset from the repository')
    url = urllib.parse.urljoin(ds_path, file_name)
    print(repr(url))
    build_data.download(url, dpath, file_name)
    build_data.untar(dpath, file_name)
    opt['raw_dataset_path'] = dpath
    print('Downloaded an insults dataset')
    raw_path = os.path.abspath(opt['raw_dataset_path'])
    train_file = os.path.join(raw_path, 'train.csv')
    valid_file = os.path.join(raw_path, 'test_with_solutions.csv')
    test_file = os.path.join(raw_path, 'impermium_verification_labels.csv')
train_data = pd.read_csv(train_file)
train_data = train_data.drop('Date', axis=1)
test_data = pd.read_csv(test_file)
test_data = test_data.drop('id', axis=1)
test_data = test_data.drop('Usage', axis=1)
test_data = test_data.drop('Date', axis=1)
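# The fragment above defines valid_file but never reads it; presumably the
# validation split is loaded and cleaned the same way as train_data (a minimal
# sketch, assuming the same pandas-based handling):
valid_data = pd.read_csv(valid_file)
valid_data = valid_data.drop('Date', axis=1)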
def build(opt):
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'wikiqa.tar.gz'
        url = 'http://parl.ai/downloads/wikiqa/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'WikiQACorpus')
        create_fb_format(dpath, 'train', os.path.join(dpext, 'WikiQA-train.tsv'))
        create_fb_format(dpath, 'valid', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test', os.path.join(dpext, 'WikiQA-test.tsv'))
        create_fb_format(
            dpath, 'train-filtered', os.path.join(dpext, 'WikiQA-train.tsv')
        )
        create_fb_format(dpath, 'valid-filtered', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test-filtered', os.path.join(dpext, 'WikiQA-test.tsv'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
version = None
# check if data had been previously built
if not build_data.built(dpath, version_string=version):
    print('[building data: ' + dpath + ']')

    # make a clean directory if needed
    if build_data.built(dpath):
        # an older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # download the data.
    fname = 'mnist.tar.gz'
    url = 'https://s3.amazonaws.com/fair-data/parlai/mnist/' + fname  # dataset URL
    build_data.download(url, dpath, fname)
    # uncompress it
    build_data.untar(dpath, fname)

    # mark the data as built
    build_data.mark_done(dpath, version_string=version)
build_data.remove_dir(dpath)
build_data.make_dir(dpath)
# Download the data.
fname1 = 'Questions_Train_mscoco.zip'
fname2 = 'Questions_Val_mscoco.zip'
fname3 = 'Questions_Test_mscoco.zip'
fname4 = 'Annotations_Val_mscoco.zip'
fname5 = 'Annotations_Train_mscoco.zip'
url = 'http://visualqa.org/data/mscoco/vqa/'
build_data.download(url + fname1, dpath, fname1)
build_data.download(url + fname2, dpath, fname2)
build_data.download(url + fname3, dpath, fname3)
build_data.download(url + fname4, dpath, fname4)
build_data.download(url + fname5, dpath, fname5)
build_data.untar(dpath, fname1)
build_data.untar(dpath, fname2)
build_data.untar(dpath, fname3)
build_data.untar(dpath, fname4)
build_data.untar(dpath, fname5)
# Mark the data as built.
build_data.mark_done(dpath, version_string=version)
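# A more compact equivalent of the VQA download block above (a sketch, assuming
# the same build_data helpers and that the archives can be fetched and
# extracted in any order):
for fname in (fname1, fname2, fname3, fname4, fname5):
    build_data.download(url + fname, dpath, fname)
    build_data.untar(dpath, fname)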
build_data.remove_dir(dpath)
# Build the folders tree
build_data.make_dir(dpath)
build_data.make_dir(join(dpath, 'scorer'))
build_data.make_dir(join(dpath, 'train'))
build_data.make_dir(join(dpath, 'valid'))
# urls
dataset_url = 'http://rucoref.maimbava.net/files/rucoref_29.10.2015.zip'
scorer_url = 'http://conll.cemantix.org/download/reference-coreference-scorers.v8.01.tar.gz'
# download the CoNLL-2012 scorer v8.01
start = time.time()
print('[Download the conll-2012 scorer]...')
build_data.download(scorer_url, join(dpath, 'scorer'), 'reference-coreference-scorers.v8.01.tar.gz')
build_data.untar(join(dpath, 'scorer'), 'reference-coreference-scorers.v8.01.tar.gz')
print('[Scorer was downloaded]...')
# download dataset
fname = 'rucoref_29.10.2015.zip'
print('[Download the rucoref dataset]...')
build_data.make_dir(join(dpath, 'rucoref_29.10.2015'))
build_data.download(dataset_url, join(dpath, 'rucoref_29.10.2015'), fname)
# uncompress it
build_data.untar(join(dpath, 'rucoref_29.10.2015'), 'rucoref_29.10.2015.zip')
print('End of download: time - {}'.format(time.time()-start))
# Convert the RuCoref files to CoNLL format
conllpath = join(dpath, 'ru_conll')
build_data.make_dir(conllpath)
utils.RuCoref2CoNLL(join(dpath, 'rucoref_29.10.2015'), conllpath, language)
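# The fragment ends after the CoNLL conversion; the other build routines in
# this file finish by marking the data directory as built, so presumably the
# same step follows here (a sketch, assuming ParlAI's build_data.mark_done):
build_data.mark_done(dpath)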
def build(opt):
    data_path = os.path.join(opt['datapath'], 'DialogueQE')
    version = '1501534800'

    if not build_data.built(data_path, version_string=version):
        print('[building data: ' + data_path + ']')
        if build_data.built(data_path):
            build_data.remove_dir(data_path)
        build_data.make_dir(data_path)

        fname = 'data_' + version + '.tar.gz'
        url = 'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + fname
        build_data.download(url, data_path, fname)
        build_data.untar(data_path, fname)

        os.rename(
            os.path.join(data_path, 'data_train_' + version + '.json'),
            os.path.join(data_path, 'train.json'),
        )
        os.rename(
            os.path.join(data_path, 'data_test_' + version + '.json'),
            os.path.join(data_path, 'test.json'),
        )

        build_data.mark_done(data_path, version_string=version)
def setup_personas_with_wiki_links(opt):
    fname = 'personas_with_wiki_links.txt'
    file_path = '{}/{}'.format(os.getcwd(), fname)
    if not os.path.exists(file_path):
        url = 'http://parl.ai/downloads/wizard_of_wikipedia/' + fname
        build_data.download(url, os.getcwd(), fname)
if build_data.built(dpath):
    # An older version exists, so remove these outdated files.
    build_data.remove_dir(dpath)
build_data.make_dir(dpath)
# Download the data.
fname1 = 'Questions_Train_mscoco.zip'
fname2 = 'Questions_Val_mscoco.zip'
fname3 = 'Questions_Test_mscoco.zip'
fname4 = 'Annotations_Val_mscoco.zip'
fname5 = 'Annotations_Train_mscoco.zip'
url = 'http://visualqa.org/data/mscoco/vqa/'
build_data.download(url + fname1, dpath, fname1)
build_data.download(url + fname2, dpath, fname2)
build_data.download(url + fname3, dpath, fname3)
build_data.download(url + fname4, dpath, fname4)
build_data.download(url + fname5, dpath, fname5)
build_data.untar(dpath, fname1)
build_data.untar(dpath, fname2)
build_data.untar(dpath, fname3)
build_data.untar(dpath, fname4)
build_data.untar(dpath, fname5)
# Mark the data as built.
build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cornell_movie_dialogs_corpus.tgz'
        url = 'http://parl.ai/downloads/cornell_movie/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'cornell movie-dialogs corpus')
        create_fb_format(
            os.path.join(dpext, 'movie_lines.txt'),
            os.path.join(dpext, 'movie_conversations.txt'),
            dpath,
        )

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'talkthewalk')
    version = 'None'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'talkthewalk.tgz'
        url = 'http://parl.ai/downloads/projects/talkthewalk/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
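# All of the build functions above follow the same pattern: check whether the
# data is already built, wipe any stale copy, download and extract an archive,
# then mark the directory as done. A generic sketch of that pattern, assuming
# ParlAI's build_data helpers (the dataset name, archive name, and URL below
# are placeholders, not real endpoints):
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MyDataset')  # hypothetical dataset dir
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download and extract the archive (placeholder URL).
        fname = 'my_dataset.tar.gz'
        url = 'http://example.com/downloads/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built so later calls skip the download.
        build_data.mark_done(dpath, version_string=version)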