I've been pulling a lot of data from webpages recently, and here's a simple way to get
the text. It's not perfect, but it's easy.
package.json:
1
2
3
4
5
6
7
| {
"type": "module",
"dependencies": {
"@mozilla/readability": "^0.5.0",
"jsdom": "^24.0.0"
}
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
| import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
/**
 * Fetch a web page and run Mozilla Readability over its DOM.
 *
 * The page is loaded with `JSDOM.fromURL`, and the resulting document is handed
 * to `Readability`, whose `parse()` result is returned unchanged.
 *
 * @param {string} url - URL of the page to load.
 * @returns {Promise<object|null>} The value produced by `Readability#parse()`;
 *   callers in this file read its `title` and `textContent` properties. Per the
 *   Readability documentation, `parse()` may yield `null` when no article
 *   content can be extracted, so callers should check before dereferencing.
 */
async function extractText(url) {
  const dom = await JSDOM.fromURL(url);
  try {
    return new Readability(dom.window.document).parse();
  } finally {
    // Release the jsdom window once parsing is finished (even if it throws),
    // so repeated calls do not keep every fetched page's DOM alive.
    dom.window.close();
  }
}
// Demo: extract one article and print its title followed by its plain text.
const text = await extractText("https://willschenk.com/fragments/2024/discovering_idagio/");
if (text === null) {
  // Nothing was extracted; report it instead of crashing on `null.title`.
  console.error("No readable article content found.");
  process.exitCode = 1;
} else {
  console.log(text.title);
  console.log(text.textContent);
}
|