const cheerio = require('cheerio');
const {Document} = require('langchain/document');

/**
 * Options passed to `fetch` for every page request.
 *
 * The values are kept exactly as the loader has always sent them, so the
 * outgoing request is unchanged.
 */
const PAGE_REQUEST_OPTIONS = Object.freeze({
    "headers": {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "cache-control": "max-age=0",
    },
    "referrerPolicy": "strict-origin-when-cross-origin",
    "body": null,
    "method": "GET",
    "mode": "cors",
    "credentials": "include",
});

/** Tags whose content is not page text and is dropped before extraction. */
const NON_CONTENT_SELECTOR = "script,style,meta,link,noscript";

/**
 * Loads a single web page and exposes its visible text as a LangChain
 * `Document`.
 */
class WebpageLoader {
    /** URL of the page to load. */
    url;

    /**
     * @param {string} url - Address of the page to fetch.
     */
    constructor(url) {
        this.url = url;
    }

    /**
     * Fetches the page, strips non-content tags and collects the text of
     * every direct child of `<body>`, one child per line.
     *
     * @returns {Promise<Document[]>} A one-element array whose document holds
     *   the extracted text in `pageContent` and the URL in `metadata.source`.
     * @throws {Error} If the server answers with a non-2xx status. Network
     *   failures reject with whatever `fetch` itself throws.
     */
    async load() {
        const response = await fetch(this.url, PAGE_REQUEST_OPTIONS);

        // An error page must not be indexed as if it were the real content.
        if (!response.ok) {
            throw new Error(
                `Failed to load ${this.url}: HTTP ${response.status} ${response.statusText}`
            );
        }

        const pageHtml = await response.text();
        const $ = cheerio.load(pageHtml);

        // Remove non-content tags so their source does not leak into the text.
        $(NON_CONTENT_SELECTOR).remove();

        // Walk the top-level body children; each contributes its text plus a
        // trailing newline.
        const pageContent = $('body > *')
            .toArray()
            .map((element) => `${$(element).text()}\n`)
            .join('');

        return [
            new Document({
                pageContent,
                metadata: {
                    source: this.url,
                },
            }),
        ];
    }
}

module.exports.WebpageLoader = WebpageLoader;