# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): whitespace-mangled fragment of a MARC harvesting script --
# the original loop/if indentation is lost, and the names oldrecords, title,
# issns, ids and has_digitalization_links are defined outside this span.
# Comments describe apparent intent; confirm against the original repository.
# Abort when the upstream request failed.
sys.exit("Anfrage nicht erfolgreich")
if len(oldrecords) == 0:
# Abort on an empty result list.
sys.exit("leere Liste")
# Process all entries except the last -- presumably a trailing split
# remainder; TODO confirm.
for oldrecord in oldrecords[:-1]:
# BUG(review): "(\.*?\)" is not a valid pattern -- the group opened by "("
# is never closed ("\)" is a literal paren), so re.search raises re.error
# at compile time. Probably meant to capture a <record>...</record> span;
# confirm the intended pattern.
match = re.search("(\.*?\)", oldrecord)
if not match:
sys.exit("kein record-Element")
oldrecord = match.group(1)
# Re-encode so pymarc's XML parser can consume a bytes stream.
oldrecord = oldrecord.encode("utf-8")
oldrecord = BytesIO(oldrecord)
oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
oldrecord = oldrecord[0]
newrecord = marcx.Record.from_record(oldrecord)
newrecord.force_utf8 = True
newrecord.strict = False  # tolerate malformed MARC fields instead of raising
# Check that a title (field 245 $a) is present; skip the record otherwise.
try:
newrecord["245"]["a"]
# NOTE(review): bare except also swallows SystemExit/KeyboardInterrupt;
# "except (KeyError, TypeError):" would be the targeted form.
except:
continue
# Check whether this is a digitization or contains a link to one.
f856 = newrecord["856"]
if not f856 or not has_digitalization_links(newrecord):
continue
# Normalize the title for use as part of a dedup identifier.
# NOTE(review): title is never assigned in this fragment -- a line such as
# title = newrecord["245"]["a"] appears to be missing; confirm upstream.
title = title.replace("/", "")
title = title.replace(")", "")
title = title.replace("(", "")
title = title.replace("-", "")
title = title.lower()
#title = title.split(":")[0]
# Build one composite id per ISSN: ISSN without hyphen + normalized title.
for issn in issns:
issn = issn.replace("-", "")
id = issn + title
ids.add(id)
# NOTE(review): mangled fragment -- indentation lost; xmlstream and
# inputfilename are defined outside this span; the trailing try: is truncated.
# Stream <record> elements from the input file and rewrite control fields.
for oldrecord in xmlstream(inputfilename, "record"):
oldrecord = BytesIO(oldrecord)  # parse_xml_to_array needs a file-like object
oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
oldrecord = oldrecord[0]  # each streamed chunk holds one record -- assumed
record = marcx.Record.from_record(oldrecord)
record.force_utf8 = True
record.strict = False  # tolerate malformed MARC fields instead of raising
# Identifier: prefix source id "68-" onto the original 001 control number.
f001 = record["001"].data
record.remove_fields("001")
record.add("001", data="68-" + f001)
# Access type: 007 "cr" -- presumably marks an online resource; confirm
# against the MARC 21 007 definition.
record.add("007", data="cr")
# ISSN check (body of this try block is outside the visible span).
try:
# NOTE(review): fragment of a CLI tool reading newline-delimited JSON records
# from stdin and fixing embedded MARC; parser, fixRecord, transpose_to_ldj
# and MARCReader are defined outside this span; indentation is lost.
args=parser.parse_args()
if args.help:
parser.print_help(sys.stderr)
exit()
for line in sys.stdin:
record=json.loads(line)
# Binary-MARC path: recordtype mentions "marc" but not "xml".
if record and record.get("recordtype") and args.format=="marc" and "marc" in record.get("recordtype") and not "xml" in record.get("recordtype"):
marcFullRecordFixed=fixRecord(record=record.get(args.frfield),record_id=record.get("record_id"),validation=args.valid,replaceMethod=args.replaceMethod)
if not args.toJson:
sys.stdout.write(marcFullRecordFixed)
else:
# Re-parse the fixed record and emit it as line-delimited JSON.
for record in MARCReader(marcFullRecordFixed.encode('utf-8'), to_unicode=True):
sys.stdout.write(json.dumps(transpose_to_ldj(record)))
# MARCXML path.
elif record and record.get("recordtype") and "marcxml" in record.get("recordtype") and args.format=="marcxml":
# NOTE(review): the parsed array is discarded here -- likely a missing
# assignment or output call; confirm against the original source.
pymarc.marcxml.parse_xml_to_array(StringIO(record.get(args.frfield))) #need wrapper in StringIO for read()-need in marcxml lib
# NOTE(review): mangled fragment, near-duplicate of an earlier block --
# oldrecords is defined outside this span; indentation is lost.
# Abort when the upstream request failed.
sys.exit("Anfrage nicht erfolgreich")
if len(oldrecords) == 0:
# Abort on an empty result list.
sys.exit("leere Liste")
for oldrecord in oldrecords[:-1]:
# BUG(review): "(\.*?\)" is not a valid pattern (unterminated group --
# re.error at compile time); probably meant to capture a record element.
match = re.search("(\.*?\)", oldrecord) #327963263
if not match:
sys.exit("kein record-Element")
oldrecord = match.group(1)
# NOTE(review): this variant encodes latin-1 where the sibling fragment
# uses utf-8 -- confirm which encoding the upstream source delivers.
oldrecord = oldrecord.encode("latin-1")
oldrecord = BytesIO(oldrecord)
oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
oldrecord = oldrecord[0]
newrecord = marcx.Record.from_record(oldrecord)
newrecord.force_utf8 = True
newrecord.strict = False  # tolerate malformed MARC fields instead of raising
# Skip records without a title (245 $a); bare except also catches the
# TypeError raised when the field is absent -- narrow it if possible.
try:
newrecord["245"]["a"]
except:
continue
# Identifier: prefix source id "183-" onto the original 001 control number.
f001 = newrecord["001"].data
newrecord.remove_fields("001")
newrecord.add("001", data="183-" + f001)
def convert_ceeol_to_intermediate_schema(filename):
"""
Given an XML filename, convert data to intermediate schema. Note: The XML
refers to the partially broken XML supplied as of 07/2019.
File must fit into memory.
Yields converted dictionaries.

NOTE(review): indentation of this fragment is mangled and the body is
truncated mid-dict at "authors" -- restore structure from the original
source before relying on this copy.
"""
# NOTE(review): the dots in the hostname are unescaped and match any
# character; harmless in practice, but r"https://www\.ceeol\.com/..."
# would be exact.
article_url_re = re.compile(r"https://www.ceeol.com/search/article-detail\?id=([0-9]+)")
with open(filename) as handle:
records = pymarc.marcxml.parse_xml_to_array(handle)
for record in records:
record = marcx.Record.from_record(record)
# Try to find ID.
for url in record.itervalues("856.u"):
match = article_url_re.search(url)
if not match:
continue
record_id = match.group(1)
break
else:
# for/else: no 856 $u URL matched the article-detail pattern.
raise ValueError("missing record id")
doc = {
"abstract": record.firstvalue("520.a", default=""),
"authors": [{
# NOTE(review): fragment of script "39" -- indentation lost, truncated right
# after remove_fields("001") (the replacement 001 is outside this span).
# Default filenames, overridable by three positional CLI arguments.
inputfilename = "39_input.xml"
outputfilename = "39_output.mrc"
filterfilename = "39_issn_filter"
if len(sys.argv) >= 4:
inputfilename, outputfilename, filterfilename = sys.argv[1:4]
# NOTE(review): files opened without a with-block; closing is not visible
# in this fragment.
outputfile = io.open(outputfilename, "wb")
filterfile = io.open(filterfilename, "r")
# One ISSN per line; strip trailing newlines.
issn_list = filterfile.readlines()
issn_list = [issn.rstrip("\n") for issn in issn_list]
for record in xmlstream(inputfilename, "record"):
record = BytesIO(record)  # parse_xml_to_array needs a file-like object
record = pymarc.marcxml.parse_xml_to_array(record)
record = record[0]
record = marcx.Record.from_record(record)
record.force_utf8 = True
record.strict = False  # tolerate malformed MARC fields instead of raising
# Check that a title field (245) is present.
if not record["245"]:
continue
# Leader: blank out the first five positions (record length digits).
record.leader = "     " + record.leader[5:]
# Identifier
f001 = record["001"].data
record.remove_fields("001")
# NOTE(review): fragment of script "35" -- indentation lost, truncated right
# after remove_fields("001").
# Default filenames, overridable by three positional CLI arguments.
inputfilename = "35_input.xml"
outputfilename = "35_output.mrc"
lccfilename = "lcc"
if len(sys.argv) == 4:
inputfilename, outputfilename, lccfilename = sys.argv[1:]
# NOTE(review): inputfile is opened but unused in this span (xmlstream below
# takes the filename directly); none of these files are closed here.
inputfile = open(inputfilename, "r", encoding='utf-8')
outputfile = open(outputfilename, "wb")
lccfile = open(lccfilename, "r")
lccs = lccfile.readlines()
for oldrecord in xmlstream(inputfilename, "record"):
oldrecord = BytesIO(oldrecord)  # parse_xml_to_array needs a file-like object
oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
oldrecord = oldrecord[0]
marcrecord = marcx.Record.from_record(oldrecord)
marcrecord.force_utf8 = True
marcrecord.strict = False  # tolerate malformed MARC fields instead of raising
# Leader: blank out the first five positions (record length digits).
marcrecord.leader = "     " + marcrecord.leader[5:]
# Identifier: derived from the last path segment of the 856 $u URL,
# with dots removed.
f001 = marcrecord["856"]["u"]
match = re.search(".*\/(.*)", f001)
if match:
f001 = match.group(1)
f001 = f001.replace(".", "")
marcrecord.remove_fields("001")
# NOTE(review): fragment -- walks a directory tree of MARCXML files;
# indentation lost, truncated right after the 001 rewrite.
if len(sys.argv) == 3:
input_directory, outputfilename = sys.argv[1:]
outputfile = open(outputfilename, "wb")  # not closed in this span
for root, _, files in os.walk(input_directory):
for filename in files:
# Only process XML files.
if not filename.endswith(".xml"):
continue
inputfilepath = os.path.join(root, filename)
for oldrecord in xmlstream(inputfilepath, "record"):
oldrecord = BytesIO(oldrecord)  # parse_xml_to_array needs a file-like object
oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
oldrecord = oldrecord[0]
marcrecord = marcx.Record.from_record(oldrecord)
marcrecord.force_utf8 = True
marcrecord.strict = False  # tolerate malformed MARC fields instead of raising
# Skip records without a title (245 $a); bare except also catches the
# TypeError raised when the field is absent -- narrow it if possible.
try:
marcrecord["245"]["a"]
except:
continue
# Identifier: prefix source id "183-" onto the original 001 control number.
f001 = marcrecord["001"].data
marcrecord.remove_fields("001")
marcrecord.add("001", data="183-" + f001)
def worldcat(num, num_type, url, key):
"""
Look up a bibliographic record on WorldCat and collect selected fields.

num -- identifier value (kind given by num_type)
num_type -- identifier kind; interpolated into the URL and uppercased
            as a key of the result dict
url -- format string with three %s slots for (num_type, num, key)
key -- API key, interpolated into the URL

NOTE(review): truncated fragment -- indentation is mangled and the end of
the function is outside this span; presumably it returns the bib dict.
"""
url = url % (num_type, num, key)
try:
records = marcxml.parse_xml_to_array(urlopen(url))
if not records:
return None
record = records[0]
# NOTE(review): bare except hides network/parse errors (and even
# KeyboardInterrupt); "except Exception:" would be safer.
except:
return None
bib = {}
bib[num_type.upper()] = num
# Prefer the uniform title; fall back to the ordinary title, else "".
bib['TITLE'] = record.uniformtitle()
if not bib['TITLE']:
bib['TITLE'] = record.title() if record.title() else ''
bib['TITLE_ALL'] = bib['TITLE']
# Added entries plus the main author, main author inserted first.
bib['AUTHORS'] = [entry.format_field() for entry in record.addedentries()]
bib['AUTHOR'] = record.author() if record.author() else ''
if bib['AUTHOR']:
bib['AUTHORS'].insert(0, bib['AUTHOR'])
bib['PUBLISHER'] = record.publisher() if record.publisher() else ''
# NOTE(review): fragment -- harvests HathiTrust public-domain MARC via the
# external metha OAI client, then post-processes; indentation lost and
# truncated right after remove_fields("001"). build_inputfilename, args,
# SID, filemap and xmlstream are defined outside this span.
inputfilename = build_inputfilename(args, "xml", SID)
# NOTE(review): os.system runs a shell string; acceptable for a generated
# filename, but shell-injection-prone if inputfilename were ever untrusted.
os.system("metha-sync -set hathitrust:pd -format marc21 https://quod.lib.umich.edu/cgi/o/oai/oai")
os.system("metha-cat -set hathitrust:pd -format marc21 https://quod.lib.umich.edu/cgi/o/oai/oai > %s" % inputfilename)
lccfile = open(filemap, "r")  # not closed in this span
lccs = lccfile.readlines()
##################################################################################
# 3. Process data
##################################################################################
for oldrecord in xmlstream(inputfilename, "record"):
oldrecord = BytesIO(oldrecord)  # parse_xml_to_array needs a file-like object
oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
oldrecord = oldrecord[0]
marcrecord = marcx.Record.from_record(oldrecord)
marcrecord.force_utf8 = True
marcrecord.strict = False  # tolerate malformed MARC fields instead of raising
# Leader: blank out the first five positions (record length digits).
marcrecord.leader = "     " + marcrecord.leader[5:]
# Identifier: last path segment of the 856 $u URL, dots removed.
f001 = marcrecord["856"]["u"]
match = re.search(".*\/(.*)", f001)
if match:
f001 = match.group(1)
f001 = f001.replace(".", "")
marcrecord.remove_fields("001")