How to use metascraper - 10 common examples

To help you get started, we’ve selected a few metascraper examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github coralproject / talk / jobs / scraper / index.js View on Github external
const Asset = require('../../models/asset');
const scraper = require('../../services/scraper');
const Assets = require('../../services/assets');
const { createLogger } = require('../../services/logging');
const logger = createLogger('jobs:scraper');
const fetch = require('node-fetch');
const { merge } = require('lodash');
const { version } = require('../../package.json');
const { SCRAPER_HEADERS } = require('../../config');

// Load the scraper with the rules.
// Built-in field extractors first, then project-local rules so the local
// `modified`/`section` rules are appended after the standard set.
const metascraper = require('metascraper').load([
  require('metascraper-title')(),
  require('metascraper-description')(),
  require('metascraper-image')(),
  require('metascraper-author')(),
  require('metascraper-date')(),
  require('./rules/modified')(),
  require('./rules/section')(),
]);

// SCRAPER_HEADERS is parsed as JSON into an object of extra request headers.
// A malformed value is a configuration error: log it and rethrow so the
// process fails fast at startup instead of scraping with missing headers.
let customHeaders = {};
try {
  customHeaders = JSON.parse(SCRAPER_HEADERS);
} catch (err) {
  console.error('Cannot parse TALK_SCRAPER_HEADERS');
  throw err;
}
github coralproject / talk / services / scraper / index.js View on Github external
const fetch = require('node-fetch');
const ProxyAgent = require('proxy-agent');
const { merge } = require('lodash');

const { SCRAPER_HEADERS, SCRAPER_PROXY_URL } = require('../../config');
const kue = require('../kue');
const { version } = require('../../package.json');

// Load the scraper with the rules.
// Built-in field extractors first, then project-local rules so the local
// `modified`/`section` rules are appended after the standard set.
const metascraper = require('metascraper').load([
  require('metascraper-title')(),
  require('metascraper-description')(),
  require('metascraper-image')(),
  require('metascraper-author')(),
  require('metascraper-date')(),
  require('./rules/modified')(),
  require('./rules/section')(),
]);

// SCRAPER_HEADERS is parsed as JSON into an object of extra request headers.
// A malformed value is a configuration error: log it and rethrow so the
// process fails fast at startup instead of scraping with missing headers.
let customHeaders = {};
try {
  customHeaders = JSON.parse(SCRAPER_HEADERS);
} catch (err) {
  console.error('Cannot parse TALK_SCRAPER_HEADERS');
  throw err;
}
github Human-Connection / Embed-API / src / services / embeds / embeds.class.js View on Github external
/* eslint-disable no-unused-vars */
const errors = require('@feathersjs/errors');
const mongoose = require('mongoose');
const { URL } = require('url');
// Load metascraper with only the four fields this embed service exposes.
const metascraper = require('metascraper').load([
  require('metascraper-date')(),
  require('metascraper-title')(),
  require('metascraper-description')(),
  require('metascraper-image')()
]);
const got = require('got');

// NOTE(review): Metaphor is set up as a second metadata engine alongside
// metascraper; previews are disabled, tweet metadata kept — confirm how
// the two engines are combined in the (not shown) getMetadata below.
const Metaphor = require('metaphor');
const engine = new Metaphor.Engine({
  preview: false,
  tweet: true
});

const getMetadata = async (targetURL, Provider) => {
  const data = {
    metaphor: {},
github Human-Connection / API / server / services / contributions / hooks / metascraper.js View on Github external
// get link metadata
// TODO: add more services and use the metascraper to fill some metadata on the article

// Load metascraper with the full rule set this hook extracts:
// author, date, description, image, three logo strategies (generic,
// Clearbit, favicon), publisher, title, canonical url, and YouTube.
const metascraper = require('metascraper').load([
  require('metascraper-author')(),
  require('metascraper-date')(),
  require('metascraper-description')(),
  require('metascraper-image')(),
  require('metascraper-logo')(),
  require('metascraper-clearbit-logo')(),
  require('metascraper-logo-favicon')(),
  require('metascraper-publisher')(),
  require('metascraper-title')(),
  require('metascraper-url')(),
  require('metascraper-youtube')(),
]);
const got = require('got');
const _ = require('lodash');

const getMetadata = async (targetUrl, app) => {
github Human-Connection / Human-Connection / backend / src / schema / resolvers / embeds / scraper.js View on Github external
import Metascraper from 'metascraper'
import fetch from 'node-fetch'

import { ApolloError } from 'apollo-server'
import isEmpty from 'lodash/isEmpty'
import isArray from 'lodash/isArray'
import mergeWith from 'lodash/mergeWith'
import findProvider from './findProvider'

const error = require('debug')('embed:error')

const metascraper = Metascraper([
  require('metascraper-author')(),
  require('metascraper-date')(),
  require('metascraper-description')(),
  require('metascraper-image')(),
  require('metascraper-lang')(),
  require('metascraper-lang-detector')(),
  require('metascraper-logo')(),
  // require('metascraper-clearbit-logo')(),
  require('metascraper-publisher')(),
  require('metascraper-title')(),
  require('metascraper-url')(),
  require('metascraper-audio')(),
  require('metascraper-soundcloud')(),
  require('metascraper-video')(),
  require('metascraper-youtube')(),
github NullVoxPopuli / tanqueReact / js / components / chat / message-list / message-row / message-content / index.jsx View on Github external
getTags(url) {
    Metascraper
      .scrapeUrl(url)
      .then(metadata => {
        const hasTags = !_.isEmpty(metadata);
        this.setState({ tags: metadata, hasTags });
      })
      .catch(console.info);
  }
github withspectrum / micro-open-graph / index.js View on Github external
const { parse } = require('url');
const { send } = require('micro');
const got = require('got');
const cache = require('memory-cache');

// Load the scraper with the rules.
// Fixes over the previous version: the duplicate
// `metascraper-logo-favicon` rule has been removed (it appeared twice),
// the mixed tab/space indentation is normalized, and statement
// terminators are made consistent with the rest of the file.
const metascraper = require('metascraper').load([
  require('metascraper-author')(),
  require('metascraper-date')(),
  require('metascraper-description')(),
  require('metascraper-image')(),
  require('metascraper-logo')(),
  require('metascraper-clearbit-logo')(),
  require('metascraper-logo-favicon')(),
  require('metascraper-publisher')(),
  require('metascraper-title')(),
  require('metascraper-url')(),
  require('metascraper-amazon')(),
  require('metascraper-youtube')(),
  require('metascraper-soundcloud')(),
  require('metascraper-video-provider')(),
]);
github olymp / olymp / packages / scrape / server / graphql.es6 View on Github external
scrape: async (source, { url }) => {
        // Fetch the page body and run metascraper over it, returning the
        // extracted metadata keyed by the requested URL.
        // Fix: the original contained a second `return Metascraper.scrapeUrl(...)`
        // chain after this return — unreachable dead code — which has been removed.
        const { body: html } = await got(url);
        const metadata = await Metascraper({ html, url });
        return {
          ...metadata,
          id: url,
        };
      }
    },
github jaredpalmer / react-email-workflow / web / extract.js View on Github external
router.post('/', (req, res, next) => {
  if (!req.body.url) {
    return res.status(400).json({
      type: 'error',
      error_code: 400,
      error_message: 'Invalid request. Missing url',
    });
  }
  const timer = logger.time('extract.post').namespace(req.body.url);
  Metascraper.scrapeUrl(req.body.url).then(
    data => {
      const payload = {
        url: data.url || req.body.url || '',
        title: data.title || 'Unable to scrape title.',
        content:
          data.description ||
          "Error: Unable to scrape description from the provided url. You'll have to do this on your own.",
        author: data.publisher || 'Unable to scrape author.',
        image: data.image || '',
      };
      cache.put(req.body.url, payload, TWENTY_FOUR_HOURS);
      logger.log(Object.assign({}, { type: 'info' }, payload));
      res.status(200).json(payload);
    },
    e => {
      timer.log();
github jaredpalmer / react-email-workflow / services / extract.js View on Github external
function onRequestDataExtraction(message, reply) {
    logger.log(message);
    const timer = logger.time('extract.post').namespace(message);
    const cachedResult = cache.get(message.url);
    if (cachedResult) {
      return reply(cachedResult);
    }
    Metascraper.scrapeUrl(message.url)
      .then(data => {
        timer.log();
        const payload = {
          url: data.url || message.url,
          title: data.title || 'Unable to scrape title.',
          content: data.description ||
            "Error: Unable to scrape description from the provided url. You'll have to do this on your own.",
          author: data.publisher || 'Unable to scrape author.',
          image: data.image || '',
        };
        cache.put(message.url, payload, TWENTY_FOUR_HOURS);
        logger.log(Object.assign({}, { type: 'info' }, payload));
        reply(payload);
      })
      .catch(e => {
        timer.log();

metascraper

A library to easily scrape metadata from an article on the web using Open Graph, JSON-LD, regular HTML metadata, and a series of fallbacks.

MIT
Latest version published 30 days ago

Package Health Score

89 / 100
Full package analysis