Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
return scoreIfHas;
}
return 1;
};
// Fathom ruleset for locating the main body of a blog post. Candidate elements
// are tagged 'paragraphish', scored by the rules below, and the best cluster
// of candidates is emitted under the 'content' output key.
// NOTE(review): scoreByLength, byInverseLinkDensity, hasAncestor,
// scoreByImageSize and Futils are defined outside this statement.
const rules = ruleset(
// Isolate the actual blog post body text. Based on Fathom's example
// Readability rules
// Text-bearing elements become 'paragraphish' candidates; their initial
// properties come from the scoreByLength callback.
rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
props(scoreByLength).type('paragraphish')),
// Every candidate is additionally scored by byInverseLinkDensity (going by
// the name, link-heavy nodes should score lower - confirm in its definition).
rule(type('paragraphish'), score(byInverseLinkDensity)),
// Plain <p> elements receive a fixed score of 4.5.
rule(dom('p'), score(4.5).type('paragraphish')),
// Tweaks for this particular blog
// hasAncestor('article', 10) supplies the score here; presumably 10 for nodes
// inside an <article> and 1 otherwise - confirm against its definition.
rule(type('paragraphish'), score(hasAncestor('article', 10))),
// Paragraphs inside .entry-summary are scored 0 (presumably to keep summaries
// out of the winning cluster - confirm how this Fathom version combines scores).
rule(dom('.entry-summary p'), score(0).type('paragraphish')),
// <figure> elements also become candidates, with properties from
// scoreByImageSize.
rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
// Find the best cluster of paragraph-ish nodes
// The options object holds the clustering distance costs. The winning cluster
// is emitted as 'content' after passing through Futils.domSort (presumably
// restoring document order - confirm).
rule(
type('paragraphish').bestCluster({
splittingDistance: 3,
differentDepthCost: 6.5,
differentTagCost: 2,
sameTagCost: 0.5,
strideCost: 0,
}),
out('content').allThrough(Futils.domSort)));
async function ingestArticle(hatch, {title, link, date, author}) {
let $ = await Libingester.util.fetch_html(link);
const baseURI = Libingester.util.get_doc_base_uri($, link);
for (let element = fnode.element, parent;
(parent = element.parentNode) != null &&
parent.nodeType === parent.ELEMENT_NODE;
element = parent) {
if (element.tagName.toLowerCase() === lowerTag)
return scoreIfHas;
}
return 1;
};
const rules = ruleset(
// Isolate the actual blog post body text. Based on Fathom's example
// Readability rules
rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
props(scoreByLength).type('paragraphish')),
rule(type('paragraphish'), score(byInverseLinkDensity)),
rule(dom('p'), score(4.5).type('paragraphish')),
// Tweaks for this particular blog
rule(type('paragraphish'), score(hasAncestor('article', 10))),
rule(dom('.entry-summary p'), score(0).type('paragraphish')),
rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
// Find the best cluster of paragraph-ish nodes
rule(
type('paragraphish').bestCluster({
splittingDistance: 3,
differentDepthCost: 6.5,
differentTagCost: 2,
sameTagCost: 0.5,
strideCost: 0,
}),
(parent = element.parentNode) != null &&
parent.nodeType === parent.ELEMENT_NODE;
element = parent) {
if (element.tagName.toLowerCase() === lowerTag)
return scoreIfHas;
}
return 1;
};
// Fathom ruleset for locating the main body of a blog post. Candidate elements
// are tagged 'paragraphish', scored by the rules below, and the best cluster
// of candidates is emitted under the 'content' output key.
// NOTE(review): scoreByLength, byInverseLinkDensity, hasAncestor,
// scoreByImageSize and Futils are defined outside this statement.
const rules = ruleset(
// Isolate the actual blog post body text. Based on Fathom's example
// Readability rules
// Text-bearing elements become 'paragraphish' candidates; their initial
// properties come from the scoreByLength callback.
rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
props(scoreByLength).type('paragraphish')),
// Every candidate is additionally scored by byInverseLinkDensity (going by
// the name, link-heavy nodes should score lower - confirm in its definition).
rule(type('paragraphish'), score(byInverseLinkDensity)),
// Plain <p> elements receive a fixed score of 4.5.
rule(dom('p'), score(4.5).type('paragraphish')),
// Tweaks for this particular blog
// hasAncestor('article', 10) supplies the score here; presumably 10 for nodes
// inside an <article> and 1 otherwise - confirm against its definition.
rule(type('paragraphish'), score(hasAncestor('article', 10))),
// Paragraphs inside .entry-summary are scored 0 (presumably to keep summaries
// out of the winning cluster - confirm how this Fathom version combines scores).
rule(dom('.entry-summary p'), score(0).type('paragraphish')),
// <figure> elements also become candidates, with properties from
// scoreByImageSize.
rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
// Find the best cluster of paragraph-ish nodes
// The options object holds the clustering distance costs. The winning cluster
// is emitted as 'content' after passing through Futils.domSort (presumably
// restoring document order - confirm).
rule(
type('paragraphish').bestCluster({
splittingDistance: 3,
differentDepthCost: 6.5,
differentTagCost: 2,
sameTagCost: 0.5,
strideCost: 0,
}),
out('content').allThrough(Futils.domSort)));
return scoreIfHas;
}
return 1;
};
// Fathom ruleset for locating the main body of a blog post. Candidate elements
// are tagged 'paragraphish', scored by the rules below, and the best cluster
// of candidates is emitted under the 'content' output key.
// NOTE(review): scoreByLength, byInverseLinkDensity, hasAncestor,
// scoreByImageSize and Futils are defined outside this statement.
const rules = ruleset(
// Isolate the actual blog post body text. Based on Fathom's example
// Readability rules
// Text-bearing elements become 'paragraphish' candidates; their initial
// properties come from the scoreByLength callback.
rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
props(scoreByLength).type('paragraphish')),
// Every candidate is additionally scored by byInverseLinkDensity (going by
// the name, link-heavy nodes should score lower - confirm in its definition).
rule(type('paragraphish'), score(byInverseLinkDensity)),
// Plain <p> elements receive a fixed score of 4.5.
rule(dom('p'), score(4.5).type('paragraphish')),
// Tweaks for this particular blog
// hasAncestor('article', 10) supplies the score here; presumably 10 for nodes
// inside an <article> and 1 otherwise - confirm against its definition.
rule(type('paragraphish'), score(hasAncestor('article', 10))),
// Paragraphs inside .entry-summary are scored 0 (presumably to keep summaries
// out of the winning cluster - confirm how this Fathom version combines scores).
rule(dom('.entry-summary p'), score(0).type('paragraphish')),
// <figure> elements also become candidates, with properties from
// scoreByImageSize.
rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
// Embedded video wrappers become candidates with a fixed score of 100 and a
// note of {length: 1} (presumably so later rules that read the note's length
// still work for a node with no text - confirm).
rule(dom('.jetpack-video-wrapper'), props(() => ({
score: 100,
note: {length: 1},
})).type('paragraphish')),
// Find the best cluster of paragraph-ish nodes
// The options object holds the clustering distance costs. The winning cluster
// is emitted as 'content' after passing through Futils.domSort (presumably
// restoring document order - confirm).
rule(
type('paragraphish').bestCluster({
splittingDistance: 3,
differentDepthCost: 6.5,
differentTagCost: 2,
sameTagCost: 0.5,
strideCost: 0,
}),
out('content').allThrough(Futils.domSort)));
for (let element = fnode.element, parent;
(parent = element.parentNode) != null &&
parent.nodeType === parent.ELEMENT_NODE;
element = parent) {
if (element.tagName.toLowerCase() === lowerTag)
return scoreIfHas;
}
return 1;
};
const rules = ruleset(
// Isolate the actual blog post body text. Based on Fathom's example
// Readability rules
rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
props(scoreByLength).type('paragraphish')),
rule(type('paragraphish'), score(byInverseLinkDensity)),
rule(dom('p'), score(4.5).type('paragraphish')),
// Tweaks for this particular blog
rule(type('paragraphish'), score(hasAncestor('article', 10))),
rule(dom('.entry-summary p'), score(0).type('paragraphish')),
rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
rule(dom('.jetpack-video-wrapper'), props(() => ({
score: 100,
note: {length: 1},
})).type('paragraphish')),
// Find the best cluster of paragraph-ish nodes
rule(
type('paragraphish').bestCluster({
splittingDistance: 3,
differentDepthCost: 6.5,
*/
/**
 * Property callback for a Fathom rule: derives a node's score from its text.
 *
 * @param {{element: *}} fnode Object exposing the underlying DOM element as `element`.
 * @returns {{score: number}} Object whose `score` is whatever
 *   `inlineTextLength(element)` returns (helper defined outside this block).
 */
const scoreByLength = (fnode) => {
  const textLength = inlineTextLength(fnode.element)
  return { score: textLength }
}
// Based on: https://hacks.mozilla.org/2017/04/fathom-a-framework-for-understanding-web-pages/
// Meant to be similar to Readability-like extraction of a page's main-content
// Initial tests of this are pretty inaccurate; there is a lot to learn before the rules can be tweaked and used well
const rules = ruleset(
rule(
dom('p,div,li,blockquote,h1,h2,h3,h4,h5,h6'),
props(scoreByLength).type('paragraphish'),
),
rule(
type('paragraphish'),
score(fnode => {
const paragraphishNote = fnode.noteFor('paragraphish')
return paragraphishNote
? (1 - linkDensity(fnode, paragraphishNote.inlineLength)) * 1.5
: (1 - linkDensity(fnode)) * 1.5
}),
),
rule(dom('p'), score(4.5).type('paragraphish')),
rule(
type('paragraphish').bestCluster({
splittingDistance: 3,
differentDepthCost: 6.5,
differentTagCost: 2,
sameTagCost: 0.5,
strideCost: 0,
}),
out('content').allThrough(domSort),
makeRuleset(coeffs, biases) {
return ruleset([
/**
* Image rules
*/
// consider all visible img elements
rule(dom('img').when(this.isVisible.bind(this)), note(() => ({isVisible: true})).type('image')),
// and divs, which sometimes have CSS background-images
// TODO: Consider a bonus for <img> tags.
rule(dom('div').when(fnode => this.isVisible(fnode) && this.hasBackgroundImage(fnode)), type('image')),
// better score the closer the element is to the top of the page
rule(type('image'), score(this.isAboveTheFold.bind(this)), {name: 'isAboveTheFoldImage'}),
// better score for larger images
rule(type('image'), score(this.isBig.bind(this)), {name: 'isBig'}),
// bonus for non-extreme aspect ratios, to filter out banners or nav elements
// TODO: Meant to make this a penalty, but it turns out to work as is.
// Try as a penalty.
rule(type('image'), score(this.hasSquareAspectRatio.bind(this)), {name: 'hasSquareAspectRatio'}),
// no background images, even ones that have reasonable aspect ratios
// TODO: If necessary, also look at parents. I've seen them say
// "background" in their IDs as well.
rule(type('image'), score(this.hasBackgroundInID.bind(this)), {name: 'hasBackgroundInID'}),
// return image element(s) with max score
rule(type('image').max(), out('image')),
/**
* Title rules
*/
/**
 * Compute the HSL-style saturation of a colour from its three channels.
 *
 * NOTE(review): the formula assumes channels scaled to 0..1 - confirm that
 * callers normalise their values before calling.
 *
 * @param {number} r Red channel.
 * @param {number} g Green channel.
 * @param {number} b Blue channel.
 * @returns {number} (max - min) / (1 - |2 * lightness - 1|), or 0 when that
 *   divisor is exactly 0.
 */
function saturation(r, g, b) {
    const brightest = Math.max(r, g, b);
    const darkest = Math.min(r, g, b);
    const lightness = (brightest + darkest) / 2;
    const divisor = 1 - Math.abs(2 * lightness - 1);
    // The divisor vanishes when lightness is 0 or 1 (e.g. R, G and B all 0);
    // report 0 instead of dividing by zero.
    if (divisor === 0) {
        return 0;
    }
    return (brightest - darkest) / divisor;
}
/* The actual ruleset: every <div> is typed as a candidate 'overlay', a series
   of named scoring rules rates each candidate, and the highest-scoring
   candidate(s) are emitted under the 'overlay' output key. */
// NOTE(review): the scoring callbacks big, nearlyOpaque, monochrome,
// suspiciousClassOrId and visible are defined outside this statement.
const rules = ruleset([
// Candidate selection: all <div> elements.
rule(dom('div'), type('overlay')),
// Named scoring rules; each passes a {name} option alongside its scorer
// (presumably so per-rule coefficients can be looked up by name - confirm).
rule(type('overlay'), score(big), {name: 'big'}),
rule(type('overlay'), score(nearlyOpaque), {name: 'nearlyOpaque'}),
rule(type('overlay'), score(monochrome), {name: 'monochrome'}),
// Note: this rule is named 'classOrId', not after its callback.
rule(type('overlay'), score(suspiciousClassOrId), {name: 'classOrId'}),
rule(type('overlay'), score(visible), {name: 'visible'}),
// Output: the maximum-scoring 'overlay' candidate(s).
rule(type('overlay').max(), out('overlay'))
]);
return rules;
}
}
if (element.tagName.toLowerCase() === lowerTag)
return scoreIfHas;
}
return 1;
};
// Fathom ruleset for locating the main body of a blog post. Candidate elements
// are tagged 'paragraphish', scored by the rules below, and the best cluster
// of candidates is emitted under the 'content' output key.
// NOTE(review): scoreByLength, byInverseLinkDensity, hasAncestor,
// scoreByImageSize and Futils are defined outside this statement.
const rules = ruleset(
// Isolate the actual blog post body text. Based on Fathom's example
// Readability rules
// Text-bearing elements become 'paragraphish' candidates; their initial
// properties come from the scoreByLength callback.
rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
props(scoreByLength).type('paragraphish')),
// Every candidate is additionally scored by byInverseLinkDensity (going by
// the name, link-heavy nodes should score lower - confirm in its definition).
rule(type('paragraphish'), score(byInverseLinkDensity)),
// Plain <p> elements receive a fixed score of 4.5.
rule(dom('p'), score(4.5).type('paragraphish')),
// Tweaks for this particular blog
// hasAncestor('article', 10) supplies the score here; presumably 10 for nodes
// inside an <article> and 1 otherwise - confirm against its definition.
rule(type('paragraphish'), score(hasAncestor('article', 10))),
// Paragraphs inside .entry-summary are scored 0 (presumably to keep summaries
// out of the winning cluster - confirm how this Fathom version combines scores).
rule(dom('.entry-summary p'), score(0).type('paragraphish')),
// <figure> elements also become candidates, with properties from
// scoreByImageSize.
rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
// Find the best cluster of paragraph-ish nodes
// The options object holds the clustering distance costs. The winning cluster
// is emitted as 'content' after passing through Futils.domSort (presumably
// restoring document order - confirm).
rule(
type('paragraphish').bestCluster({
splittingDistance: 3,
differentDepthCost: 6.5,
differentTagCost: 2,
sameTagCost: 0.5,
strideCost: 0,
}),
out('content').allThrough(Futils.domSort)));
async function ingestArticle(hatch, {title, link, date, author}) {
let $ = await Libingester.util.fetch_html(link);
const cMin = Math.min(r, g, b);
const delta = cMax - cMin;
const lightness = (cMax + cMin) / 2;
const denom = (1 - (Math.abs(2 * lightness - 1)));
// Return 0 if it's black (R, G, and B all 0).
return (denom === 0) ? 0 : delta / denom;
}
/* The actual ruleset: every <div> is typed as a candidate 'overlay', a series
   of named scoring rules rates each candidate, and the highest-scoring
   candidate(s) are emitted under the 'overlay' output key. */
// NOTE(review): the scoring callbacks big, nearlyOpaque, monochrome,
// suspiciousClassOrId and visible are defined outside this statement.
const rules = ruleset([
// Candidate selection: all <div> elements.
rule(dom('div'), type('overlay')),
// Named scoring rules; each passes a {name} option alongside its scorer
// (presumably so per-rule coefficients can be looked up by name - confirm).
rule(type('overlay'), score(big), {name: 'big'}),
rule(type('overlay'), score(nearlyOpaque), {name: 'nearlyOpaque'}),
rule(type('overlay'), score(monochrome), {name: 'monochrome'}),
// Note: this rule is named 'classOrId', not after its callback.
rule(type('overlay'), score(suspiciousClassOrId), {name: 'classOrId'}),
rule(type('overlay'), score(visible), {name: 'visible'}),
// Output: the maximum-scoring 'overlay' candidate(s).
rule(type('overlay').max(), out('overlay'))
]);
return rules;
}
}