How to use the pymarc.marcxml module in pymarc

To help you get started, we’ve selected a few pymarc examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github miku / siskin / siskin / assets / 14 / 14_marcxml_sru.py View on Github external
sys.exit("Anfrage nicht erfolgreich")

    # Abort when there are no record chunks at all.
    if len(oldrecords) == 0:
        sys.exit("leere Liste")

    # The last element of oldrecords is deliberately left out of the loop.
    for oldrecord in oldrecords[:-1]:

        # Pull the complete <record ...>...</record> element out of the chunk.
        # NOTE(review): "." does not match newlines here; add re.DOTALL if a
        # record element can span several lines -- confirm against real data.
        match = re.search(r"(<record.*?>.*?</record>)", oldrecord)

        if not match:
            sys.exit("kein record-Element")

        # Encode the element as UTF-8 bytes, parse it as MARCXML and keep
        # only the first parsed record.
        oldrecord = match.group(1)
        oldrecord = oldrecord.encode("utf-8")
        oldrecord = BytesIO(oldrecord)
        oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
        oldrecord = oldrecord[0]

        newrecord = marcx.Record.from_record(oldrecord)
        newrecord.force_utf8 = True
        newrecord.strict = False

        # Check whether a title is present; skip the record when the lookup
        # of 245$a fails. Only ordinary errors are treated as "no title", so
        # KeyboardInterrupt and SystemExit still propagate.
        try:
            newrecord["245"]["a"]
        except Exception:
            continue

        # Check whether this is a digitized item, i.e. whether the record
        # contains a link to a digitized copy; skip it otherwise.
        f856 = newrecord["856"]
        if not f856 or not has_digitalization_links(newrecord):
            continue
github miku / siskin / siskin / assets / 68 / 68_marcbinary.py View on Github external
title = title.replace("/", "")
    title = title.replace(")", "")
    title = title.replace("(", "")
    title = title.replace("-", "")
    title = title.lower()
    #title = title.split(":")[0]

    for issn in issns:
        issn = issn.replace("-", "")
        id = issn + title
        ids.add(id)

# Process every item that xmlstream produces for the "record" tag of the
# input file.
for oldrecord in xmlstream(inputfilename, "record"):

    # Wrap the item in a file-like object for the MARCXML parser and keep
    # only the first parsed record.
    oldrecord = BytesIO(oldrecord)
    oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
    oldrecord = oldrecord[0]

    record = marcx.Record.from_record(oldrecord)
    record.force_utf8 = True
    record.strict = False

    # Identifier: replace field 001 with its previous value prefixed by "68-".
    f001 = record["001"].data
    record.remove_fields("001")
    record.add("001", data="68-" + f001)

    # Access type
    record.add("007", data="cr")

    # ISSN-Check
    try:
github slub / efre-lod-elasticsearch-tools / helperscripts / fincsolr2marc.py View on Github external
args=parser.parse_args()
    if args.help:
        parser.print_help(sys.stderr)
        exit()        

    # Each line of standard input is parsed as one JSON document.
    for line in sys.stdin:
        record=json.loads(line)
        # Binary MARC branch: the requested format is "marc" and the
        # "recordtype" value contains "marc" but not "xml".
        if record and record.get("recordtype") and args.format=="marc" and "marc" in record.get("recordtype") and not "xml" in record.get("recordtype"):
            marcFullRecordFixed=fixRecord(record=record.get(args.frfield),record_id=record.get("record_id"),validation=args.valid,replaceMethod=args.replaceMethod)
            if not args.toJson:
                # Write the fixed record to stdout as returned by fixRecord.
                sys.stdout.write(marcFullRecordFixed)
            else:
                # Re-read the fixed record with MARCReader and write each
                # parsed record as JSON. Note that this loop rebinds `record`.
                for record in MARCReader(marcFullRecordFixed.encode('utf-8'), to_unicode=True):
                    sys.stdout.write(json.dumps(transpose_to_ldj(record)))
        # MARCXML branch: "recordtype" contains "marcxml" and that format
        # was requested.
        elif record and record.get("recordtype") and "marcxml" in record.get("recordtype") and args.format=="marcxml":
                pymarc.marcxml.parse_xml_to_array(StringIO(record.get(args.frfield))) #need wrapper in StringIO for read()-need in marcxml lib
github miku / siskin / siskin / assets / 183 / 183_marcxml_sru.py View on Github external
sys.exit("Anfrage nicht erfolgreich")

    # Abort when there are no record chunks at all.
    if len(oldrecords) == 0:
        sys.exit("leere Liste")

    # The last element of oldrecords is deliberately left out of the loop.
    for oldrecord in oldrecords[:-1]:

        # Pull the complete <record ...>...</record> element out of the chunk.
        # NOTE(review): "." does not match newlines here; add re.DOTALL if a
        # record element can span several lines -- confirm against real data.
        # NOTE(review): the original trailing comment read "#327963263",
        # presumably a sample record id -- confirm.
        match = re.search(r"(<record.*?>.*?</record>)", oldrecord)

        if not match:
            sys.exit("kein record-Element")

        # Encode the element as Latin-1 bytes, parse it as MARCXML and keep
        # only the first parsed record.
        oldrecord = match.group(1)
        oldrecord = oldrecord.encode("latin-1")
        oldrecord = BytesIO(oldrecord)
        oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
        oldrecord = oldrecord[0]

        newrecord = marcx.Record.from_record(oldrecord)
        newrecord.force_utf8 = True
        newrecord.strict = False

        # Skip the record when the lookup of 245$a fails. Only ordinary
        # errors are treated as "no title", so KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            newrecord["245"]["a"]
        except Exception:
            continue

        # Identifier: replace field 001 with its previous value prefixed
        # by "183-".
        f001 = newrecord["001"].data
        newrecord.remove_fields("001")
        newrecord.add("001", data="183-" + f001)
github miku / siskin / siskin / sources / ceeol.py View on Github external
def convert_ceeol_to_intermediate_schema(filename):
    """
    Given an XML filename, convert data to intermediate schema. Note: The XML
    refers to the partially broken XML supplied as of 07/2019.

    File must fit into memory.

    Yields converted dictionaries.
    """
    article_url_re = re.compile(r"https://www.ceeol.com/search/article-detail\?id=([0-9]+)")

    with open(filename) as handle:
        records = pymarc.marcxml.parse_xml_to_array(handle)
        for record in records:
            record = marcx.Record.from_record(record)

            # Try to find ID.
            for url in record.itervalues("856.u"):
                match = article_url_re.search(url)
                if not match:
                    continue
                record_id = match.group(1)
                break
            else:
                raise ValueError("missing record id")

            doc = {
                "abstract": record.firstvalue("520.a", default=""),
                "authors": [{
github miku / siskin / siskin / assets / 39 / 39_marcbinary.py View on Github external
# Default paths; replaced by the first three command-line arguments when at
# least three are supplied.
inputfilename, outputfilename, filterfilename = (
    "39_input.xml",
    "39_output.mrc",
    "39_issn_filter",
)

if len(sys.argv) >= 4:
    inputfilename = sys.argv[1]
    outputfilename = sys.argv[2]
    filterfilename = sys.argv[3]

outputfile = io.open(outputfilename, "wb")
filterfile = io.open(filterfilename, "r")

# One entry per line of the filter file, with trailing newlines removed.
issn_list = [issn.rstrip("\n") for issn in filterfile.readlines()]

# Process every item that xmlstream produces for the "record" tag of the
# input file.
for record in xmlstream(inputfilename, "record"):
    # Wrap the item in a file-like object for the MARCXML parser and keep
    # only the first parsed record.
    record = BytesIO(record)
    record = pymarc.marcxml.parse_xml_to_array(record)
    record = record[0]

    record = marcx.Record.from_record(record)
    record.force_utf8 = True
    record.strict = False

    # Check whether a title is present; skip the record otherwise.
    if not record["245"]:
        continue

    # Leader: overwrite the first five characters with spaces.
    record.leader = "     " + record.leader[5:]

    # Identifier: remember the current 001 value, then drop the field.
    f001 = record["001"].data
    record.remove_fields("001")
github miku / siskin / siskin / assets / 35 / 35_marcbinary.py View on Github external
# Default paths; replaced by the command-line arguments when exactly three
# are supplied.
inputfilename = "35_input.xml"
outputfilename = "35_output.mrc"
lccfilename = "lcc"

if len(sys.argv) == 4:
    inputfilename, outputfilename, lccfilename = sys.argv[1:]

inputfile = open(inputfilename, "r", encoding='utf-8')
outputfile = open(outputfilename, "wb")
lccfile = open(lccfilename, "r")
# All lines of the lcc file, newlines included.
lccs = lccfile.readlines()

# Process every item that xmlstream produces for the "record" tag of the
# input file.
for oldrecord in xmlstream(inputfilename, "record"):

    # Wrap the item in a file-like object for the MARCXML parser and keep
    # only the first parsed record.
    oldrecord = BytesIO(oldrecord)
    oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
    oldrecord = oldrecord[0]

    marcrecord = marcx.Record.from_record(oldrecord)
    marcrecord.force_utf8 = True
    marcrecord.strict = False

    # Leader: overwrite the first five characters with spaces.
    marcrecord.leader = "     " + marcrecord.leader[5:]

    # Identifier: derived from the URL in 856$u. The greedy leading ".*"
    # makes the group capture everything after the last slash. A raw string
    # is used so the pattern contains no invalid string escape ("\/").
    f001 = marcrecord["856"]["u"]
    match = re.search(r".*/(.*)", f001)
    if match:
        f001 = match.group(1)
        f001 = f001.replace(".", "")
        marcrecord.remove_fields("001")
github miku / siskin / siskin / assets / 183 / 183_marcbinary.py View on Github external
# Take the input directory and output file name from the command line when
# exactly two arguments are supplied.
if len(sys.argv) == 3:
    input_directory, outputfilename = sys.argv[1:]

outputfile = open(outputfilename, "wb")

# Walk the input directory tree and process every file ending in ".xml".
for root, _, files in os.walk(input_directory):

    for filename in files:
        if not filename.endswith(".xml"):
            continue
        inputfilepath = os.path.join(root, filename)

        for oldrecord in xmlstream(inputfilepath, "record"):

            # Wrap the item in a file-like object for the MARCXML parser and
            # keep only the first parsed record.
            oldrecord = BytesIO(oldrecord)
            oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
            oldrecord = oldrecord[0]

            marcrecord = marcx.Record.from_record(oldrecord)
            marcrecord.force_utf8 = True
            marcrecord.strict = False

            # Skip the record when the lookup of 245$a fails. Only ordinary
            # errors are treated as "no title", so KeyboardInterrupt and
            # SystemExit still propagate.
            try:
                marcrecord["245"]["a"]
            except Exception:
                continue

            # Identifier: replace field 001 with its previous value prefixed
            # by "183-".
            f001 = marcrecord["001"].data
            marcrecord.remove_fields("001")
            marcrecord.add("001", data="183-" + f001)
github gwu-libraries / launchpad / lp / ui / apis.py View on Github external
def worldcat(num, num_type, url, key):
    """Fetch a MARCXML record by identifier and collect basic fields.

    ``url`` is a %-format template that is filled with
    ``(num_type, num, key)``. The response is parsed with
    ``marcxml.parse_xml_to_array`` and the first record is used to populate
    the ``bib`` dictionary (identifier, title, authors, publisher).

    Returns ``None`` when fetching or parsing fails, or when the response
    contains no records.
    """
    url = url % (num_type, num, key)
    # Only the fetch/parse step is guarded; ordinary errors mean "no result",
    # while KeyboardInterrupt and SystemExit still propagate.
    try:
        records = marcxml.parse_xml_to_array(urlopen(url))
    except Exception:
        return None
    if not records:
        return None
    record = records[0]
    bib = {}
    bib[num_type.upper()] = num
    # Prefer the uniform title; fall back to the regular title or ''.
    bib['TITLE'] = record.uniformtitle()
    if not bib['TITLE']:
        bib['TITLE'] = record.title() or ''
    bib['TITLE_ALL'] = bib['TITLE']
    bib['AUTHORS'] = [entry.format_field() for entry in record.addedentries()]
    bib['AUTHOR'] = record.author() or ''
    # The main author, when present, goes first in the author list.
    if bib['AUTHOR']:
        bib['AUTHORS'].insert(0, bib['AUTHOR'])
    bib['PUBLISHER'] = record.publisher() or ''
github miku / siskin / bin / 35_fincmarc.py View on Github external
inputfilename = build_inputfilename(args, "xml", SID)
    os.system("metha-sync -set hathitrust:pd -format marc21 https://quod.lib.umich.edu/cgi/o/oai/oai")
    os.system("metha-cat -set hathitrust:pd -format marc21 https://quod.lib.umich.edu/cgi/o/oai/oai > %s" % inputfilename)

lccfile = open(filemap, "r")
# All lines of the mapping file, newlines included.
lccs = lccfile.readlines()


##################################################################################
# 3. Process data
##################################################################################

# Process every item that xmlstream produces for the "record" tag of the
# input file.
for oldrecord in xmlstream(inputfilename, "record"):

    # Wrap the item in a file-like object for the MARCXML parser and keep
    # only the first parsed record.
    oldrecord = BytesIO(oldrecord)
    oldrecord = pymarc.marcxml.parse_xml_to_array(oldrecord)
    oldrecord = oldrecord[0]

    marcrecord = marcx.Record.from_record(oldrecord)
    marcrecord.force_utf8 = True
    marcrecord.strict = False

    # Leader: overwrite the first five characters with spaces.
    marcrecord.leader = "     " + marcrecord.leader[5:]

    # Identifier: derived from the URL in 856$u. The greedy leading ".*"
    # makes the group capture everything after the last slash. A raw string
    # is used so the pattern contains no invalid string escape ("\/").
    f001 = marcrecord["856"]["u"]
    match = re.search(r".*/(.*)", f001)
    if match:
        f001 = match.group(1)
        f001 = f001.replace(".", "")
        marcrecord.remove_fields("001")