How to use the ipwb.indexer function in ipwb

To help you get started, we’ve selected a few ipwb examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github oduwsdl / ipwb / tests / test_randomized_add.py View on Github external
def test_push():
    """
    Read WARC, manipulate content to ensure uniqueness, push to IPFS
      WARC should result in two CDXJ entries with three space-delimited
      fields each: surt URI, datetime, JSON
      JSON should contain AT LEAST locator, mime_type, and status fields
    """
    newWARCPath = ipwbTest.createUniqueWARC()
    # use ipwb indexer to push
    cdxjList = indexer.indexFileAt(newWARCPath, quiet=True)
    cdxj = '\n'.join(cdxjList)

    # Lines beginning with '!' are treated as metadata. Pick the first line
    # that is neither blank nor metadata; blank lines are skipped so that
    # inspecting the first character can never raise IndexError.
    firstNonMetadataEntry = next(
        (line for line in cdxj.split('\n')
         if line and not line.startswith('!')),
        '')

    assert checkCDXJFields(firstNonMetadataEntry)
    # The third space-delimited field is the JSON portion of the entry
    firstEntryLastField = firstNonMetadataEntry.split(' ', 2)[2]
    assert checkIPWBJSONFieldPresesence(firstEntryLastField)
github oduwsdl / ipwb / tests / test_indexing.py View on Github external
def test_warc_ipwbIndexerBrokenWARCRecord():
    """Index samples/warcs/broken.warc and expect exactly one CDXJ entry."""
    # Pass the path components separately: os.path.join() given a single,
    # pre-concatenated string joins nothing, and the leading '/' would turn
    # the result into an absolute path whenever dirname(__file__) is empty.
    pathOfBrokenWARC = os.path.join(os.path.dirname(__file__), os.pardir,
                                    'samples', 'warcs', 'broken.warc')
    cdxjList = indexer.indexFileAt(pathOfBrokenWARC, quiet=True)
    cdxj = '\n'.join(cdxjList)
    assert ipwbTest.countCDXJEntries(cdxj) == 1
github oduwsdl / ipwb / tests / test_indexing.py View on Github external
def test_cdxj_warc_responseRecordCount():
    """Index a freshly created unique WARC and expect two CDXJ entries."""
    uniqueWARC = ipwbTest.createUniqueWARC()
    # Run the ipwb indexer over the new WARC without console output
    indexLines = indexer.indexFileAt(uniqueWARC, quiet=True)
    entryCount = ipwbTest.countCDXJEntries('\n'.join(indexLines))
    assert entryCount == 2
github oduwsdl / ipwb / tests / testUtil.py View on Github external
def startReplay(warcFilename):
    """
    Start the replay service in a child process and index a sample WARC.

    An empty temporary .cdxj file is created and handed to replay.start,
    which runs in a separate process stored in the module-level global
    ``p``. After a fixed five second pause, the named WARC from
    samples/warcs is indexed and the resulting CDXJ is written to that
    temporary file.

    :param warcFilename: file name of a WARC inside samples/warcs
    """
    global p
    # Pass the components separately so os.path.join() actually joins them
    # and the result stays relative when dirname(__file__) is empty.
    pathOfWARC = os.path.join(os.path.dirname(__file__), os.pardir,
                              'samples', 'warcs', warcFilename)

    # Create placeholder file for replay; mkstemp creates a uniquely named,
    # empty file atomically instead of guessing a random name.
    fd, tempFilePath = tempfile.mkstemp(suffix='.cdxj')
    os.close(fd)

    p = Process(target=replay.start, args=[tempFilePath])
    p.start()
    sleep(5)  # presumably gives the replay process time to come up

    cdxjList = indexer.indexFileAt(pathOfWARC, quiet=True)
    cdxj = '\n'.join(cdxjList)

    with open(tempFilePath, 'w') as f:
        f.write(cdxj)
github oduwsdl / ipwb / ipwb / __main__.py View on Github external
def checkArgs_index(args):
    """
    Validate the environment and run the indexer with the given arguments.

    Exits via sys.exit() when ipwbUtil.isDaemonAlive() returns a falsy
    value. Otherwise the WARC path, encryption key, compression level,
    compress-first flag, output file and debug flag are forwarded to
    indexer.indexFileAt.

    :param args: object exposing the attributes e, c, warcPath,
        compressFirst, outfile and debug (presumably an argparse
        namespace -- confirm against the caller)
    """
    daemonRunning = ipwbUtil.isDaemonAlive()
    if not daemonRunning:
        sys.exit()

    # An empty-string key is passed when args.e is truthy; how the indexer
    # interprets it is not visible here -- TODO confirm
    encKey = '' if args.e else None
    # Magic 6, TA-DA! Hard-coded level used whenever args.c is truthy
    compressionLevel = 6 if args.c else None

    indexer.indexFileAt(
        args.warcPath,
        encKey,
        compressionLevel,
        args.compressFirst,
        outfile=args.outfile,
        debug=args.debug)
github oduwsdl / ipwb / ipwb / replay.py View on Github external
# If the user does not select a file, the browser also
    # submits an empty part without a filename
    if file.filename == '':
        flash('No selected file')
        return resp
    if file and allowed_file(file.filename):
        filename = secure_filename(file.filename)
        warcPath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(warcPath)

        # TODO: Check if semaphore lock exists, log it if so, wait for the lock
        # to be released, and create a new lock

        print('Indexing file from uploaded WARC at {0} to {1}'.format(
            warcPath, app.cdxjFilePath))
        indexer.indexFileAt(warcPath, outfile=app.cdxjFilePath)
        print('Index updated at {0}'.format(app.cdxjFilePath))

        app.cdxjFileContents = getIndexFileContents(app.cdxjFilePath)

        # TODO: Release semaphore lock
        resp.location = request.referrer

        return resp