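# Shared imports assumed by the snippets below. Only the standard-library modules
# (datetime, logging, os) are certain from how the code uses them; the
# project-specific modules (corpus_service, repository_service, io_service,
# oaipmh_harvester, chrono, constants) come from the surrounding package, whose
# import paths are not shown in these fragments.
import datetime
import logging
import os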
def harvest_by_ids(corpus, target, ids):
    logging.info("harvest_by_ids: {}".format(ids))
    date_begin = datetime.datetime.now()
    # harvest
    metajson_list = []
    for identifier in ids:
        metajson_list.append(oaipmh_harvester.get_record(target, identifier))
    date_harvest = datetime.datetime.now()
    chrono.chrono_trace("harvest spire and convert to metajson", date_begin, date_harvest, len(ids))
    # import
    result_import = corpus_service.import_metajson_list(corpus, metajson_list, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("import", date_harvest, date_import, len(result_import))
total_count = 0
logging.info("# Import common fields:")
if results_fields_common:
    for entry in results_fields_common:
        total_count += 1
        logging.info("rec_type: {}, _id: {}".format(entry["rec_type"], entry["_id"]))
else:
    logging.info("Empty common fields")
logging.info("# Import {} fields:".format(corpus))
if results_fields_corpus:
    for entry in results_fields_corpus:
        total_count += 1
        logging.info("rec_type: {}, _id: {}".format(entry["rec_type"], entry["_id"]))
else:
    logging.info("Empty {} fields".format(corpus))
chrono.chrono_trace("conf_fields", date_types, date_fields, total_count)
chrono.chrono_trace("Validate corpus", date_path, date_validate, None)
# export MetaJSON
corpus_service.export_corpus(corpus, metajson_file_path, constants.FORMAT_METAJSON, True)
date_export_metajson = datetime.datetime.now()
chrono.chrono_trace("Export corpus as MetaJSON", date_validate, date_export_metajson, None)
# export MODS
corpus_service.export_corpus(corpus, mods_file_path, constants.FORMAT_MODS, True)
date_export_mods = datetime.datetime.now()
chrono.chrono_trace("Export corpus as MODS", date_export_metajson, date_export_mods, None)
# export RePEc
corpus_service.export_corpus(corpus, repec_file_path, constants.FORMAT_REPEC, True)
date_export_repec = datetime.datetime.now()
chrono.chrono_trace("Export corpus as RePEc", date_export_mods, date_export_repec, None)
date_begin = datetime.datetime.now()
# conf params
corpus = "perio"
source = "Sciences Po | la bibliothèque"
rec_id_prefix = ""
input_file_path = os.path.join("data", "unimarc", "periouni.mrc")
input_format = constants.FORMAT_UNIMARC
csv_file_name = "".join(["validation-", corpus, ".csv"])
csv_file_path = os.path.join("data", "result", csv_file_name)
# conf corpus
corpus_service.clean_corpus(corpus)
corpus_service.conf_corpus(corpus, "aime")
date_clean = datetime.datetime.now()
chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None)
# import
corpus_service.import_metadata_file(corpus, input_file_path, input_format, source, rec_id_prefix, True, None)
date_import = datetime.datetime.now()
chrono.chrono_trace("Import corpus", date_clean, date_import, None)
# Validate perio
documents = repository_service.get_documents(corpus)
validate_perios(documents, csv_file_path)
date_validate = datetime.datetime.now()
chrono.chrono_trace("Validate perio", date_import, date_validate, None)
error_file_path = os.path.join(data_result_dir, "result_validation_errors.txt")
metajson_file_path = os.path.join(data_result_dir, "result_didl_metajson_spire.json")
mods_file_path = os.path.join(data_result_dir, "result_didl_mods_spire.json")
repec_file_path = os.path.join(data_result_dir, "result_repec.txt")
date_path = datetime.datetime.now()
# validate
corpus_service.validate_corpus(corpus, error_file_path)
date_validate = datetime.datetime.now()
chrono.chrono_trace("Validate corpus", date_path, date_validate, None)
# export MetaJSON
corpus_service.export_corpus(corpus, metajson_file_path, constants.FORMAT_METAJSON, True)
date_export_metajson = datetime.datetime.now()
chrono.chrono_trace("Export corpus as MetaJSON", date_validate, date_export_metajson, None)
# export MODS
corpus_service.export_corpus(corpus, mods_file_path, constants.FORMAT_MODS, True)
date_export_mods = datetime.datetime.now()
chrono.chrono_trace("Export corpus as MODS", date_export_metajson, date_export_mods, None)
# export RePEc
corpus_service.export_corpus(corpus, repec_file_path, constants.FORMAT_REPEC, True)
date_export_repec = datetime.datetime.now()
chrono.chrono_trace("Export corpus as RePEc", date_export_mods, date_export_repec, None)
def clean_corpus(corpus):
    if not corpus:
        logging.error("Error: empty corpus")
    else:
        logging.info("clean corpus: {}".format(corpus))
        date_begin = datetime.datetime.now()
        repository_service.create_corpus(corpus)
        repository_service.empty_corpus(corpus)
        repository_service.init_corpus_indexes(corpus)
        date_end = datetime.datetime.now()
        chrono.chrono_trace("clean_corpus", date_begin, date_end, None)
# import
input_file_paths = io_service.get_relevant_file_list_by_format(input_dir_path, input_format)
results = corpus_service.import_metadata_files(corpus, input_file_paths, input_format, source, rec_id_prefix, True, None)
date_import = datetime.datetime.now()
chrono.chrono_trace("Import corpus", date_clean, date_import, None)
# Validate
corpus_service.validate_corpus(corpus, error_file_path)
date_validate = datetime.datetime.now()
chrono.chrono_trace("Validate corpus", date_import, date_validate, None)
# Export mods
corpus_service.export_corpus(corpus, output_dir_path, constants.FORMAT_MODS, False, True)
date_export_mods = datetime.datetime.now()
chrono.chrono_trace("Export corpus mods", date_validate, date_export_mods, None)
# Export oai_dc
corpus_service.export_corpus(corpus, output_dir_path, constants.FORMAT_OAI_DC, False, True)
date_export_oai_dc = datetime.datetime.now()
chrono.chrono_trace("Export corpus oai_dc", date_export_mods, date_export_oai_dc, None)