| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180 | 
const path = require("path");
const fs = require("fs");
const crypto = require("crypto");
// Network download
const download = require("download");
// Loaders
const { DocxLoader } = require("langchain/document_loaders/fs/docx");
const { CSVLoader } = require("langchain/document_loaders/fs/csv");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const { PPTXLoader } = require("langchain/document_loaders/fs/pptx");
const { WebpageLoader } = require("./loaders/web.loader"); // In-house (飞码AI) loader that can filter out useless tags
// Alternative web loaders, kept for reference:
// const { CheerioWebBaseLoader } = require("langchain/document_loaders/web/cheerio"); // SSR pages, more lightweight
// const { PuppeteerWebBaseLoader } = require("langchain/document_loaders/web/puppeteer"); // SPA pages, more general, but less precise and may be blocked as a crawler
// Splitter
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");

/** Default maximum chunk size in characters; 500 is close to one natural paragraph. */
const DEFAULT_CHUNK_SIZE = 500;
/** Default number of characters shared between neighbouring chunks. */
const DEFAULT_CHUNK_OVERLAP = 100;
/** Directory (next to this module) where remote files are stored while being parsed. */
const TEMP_DIR = path.join(__dirname, "temp");
/** Matches links that start with an explicit http(s) scheme. */
const HTTP_URL_PATTERN = /^https?:\/\//i;

/**
 * Loads documents (local files, remote files, web pages) through LangChain
 * loaders, splits them into chunks, and caches the loaded result in Parse
 * (keyed by md5) when a global `Parse` SDK is available.
 */
class IndexesService {
    // Parse class used for the cache. Resolved through `global` so that the
    // service can still be constructed when the Parse SDK is not installed.
    Document = global.Parse?.Object.extend("Document");

    // Loader class per supported type key (file extension or "webpage").
    loaderMap = {
        "docx": DocxLoader,
        "pdf": PDFLoader,
        "csv": CSVLoader,
        "pptx": PPTXLoader,
        "webpage": WebpageLoader,
        // "web": PuppeteerWebBaseLoader
    };

    /**
     * Splits documents into overlapping chunks with RecursiveCharacterTextSplitter,
     * which splits recursively on separators ("\n\n", then "\n", then " ") so that
     * semantically related text tends to stay together.
     *
     * @param {Array} docs - Documents as returned by a loader.
     * @param {{chunkSize?: number, chunkOverlap?: number}} [options]
     *   chunkSize: maximum chunk size in characters (default 500).
     *   chunkOverlap: characters shared between consecutive chunks (default 100);
     *   an explicit 0 disables the overlap.
     * @returns {Promise<Array>} The chunked documents.
     */
    async split(docs, options) {
        const chunkSize = options?.chunkSize || DEFAULT_CHUNK_SIZE;
        // The splitter rejects chunkOverlap >= chunkSize, so the default overlap
        // is reduced when the caller asks for chunks of 100 characters or fewer.
        const fallbackOverlap = chunkSize > DEFAULT_CHUNK_OVERLAP
            ? DEFAULT_CHUNK_OVERLAP
            : Math.floor(chunkSize / 5);
        // `??` rather than `||`: 0 is a legitimate overlap value.
        const chunkOverlap = options?.chunkOverlap ?? fallbackOverlap;

        const splitter = new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
        return splitter.splitDocuments(docs);
    }

    /**
     * Loads a document from a URL or from a path relative to this module's directory.
     *
     * Flow: (1) when `md5File` is given and Parse is available, return the cached
     * document with that md5; (2) otherwise choose a loader from `extend` or from the
     * link's extension, downloading remote files to a temporary file first;
     * (3) store the first loaded document in Parse without waiting for the save.
     *
     * @param {string} filelink - http(s) URL, or file path relative to this module's directory.
     * @param {string} [extend] - Loader key (e.g. "pdf", "webpage") overriding extension detection.
     * @param {string} [md5File] - md5 of the file, used as the cache key.
     * @returns {Promise<Array|undefined>} Loaded documents; `undefined` when the link is
     *   missing, the download fails, the local file does not exist, or the type is unsupported.
     */
    async load(filelink, extend, md5File) {
        const Parse = global.Parse;

        // Cache hit: a document with this md5 was already loaded.
        if (md5File && Parse) {
            const query = new Parse.Query("Document");
            query.equalTo("md5", md5File);
            const cached = await query.first();
            if (cached?.id) {
                return [{
                    metadata: cached.get("metadata"),
                    pageContent: cached.get("pageContent"),
                }];
            }
        }

        // No cached result: load from the source.
        if (typeof filelink !== "string" || filelink === "") {
            console.log("文件不存在:", filelink);
            return;
        }

        const isHttp = HTTP_URL_PATTERN.test(filelink);
        const loaderKey = String(extend || this.#detectExtension(filelink, isHttp)).toLowerCase();
        let LoaderClass = this.loaderMap[loaderKey];

        let loaderPath = "";
        let downloadedPath = null;
        let md5;

        if (isHttp && (!LoaderClass || LoaderClass === WebpageLoader)) {
            // Plain web page: the web loader is given the URL itself.
            LoaderClass = WebpageLoader;
            loaderPath = filelink;
        } else if (isHttp) {
            // Remote file of a known type (.docx, .pptx, ...): download it first.
            let res = null;
            try {
                res = await this.download(filelink);
            } catch (err1) {
                console.log("文件下载失败", filelink, err1);
                return;
            }
            if (!res) {
                console.log("文件下载失败", filelink);
                return;
            }
            downloadedPath = res.tempFilePath;
            md5 = res.md5;
            loaderPath = downloadedPath;
        } else {
            loaderPath = path.join(__dirname, filelink);
            if (!fs.existsSync(loaderPath)) {
                console.log("文件不存在:", loaderPath);
                return;
            }
            // TODO: compute the md5 of local files as well (readFileBuffer + calcBufferMd5).
        }

        // Must be checked before construction: `new undefined(...)` would throw.
        if (!LoaderClass) {
            console.log("仅支持:", Object.keys(this.loaderMap));
            return;
        }

        let docs;
        try {
            docs = await new LoaderClass(loaderPath).load();
        } finally {
            // Remove the temporary download even when the loader throws.
            if (downloadedPath) {
                fs.rmSync(downloadedPath, { force: true });
            }
        }

        // Store the result keyed by md5. Deliberately not awaited: caching is
        // best-effort and must not delay or fail the load.
        // NOTE(review): only the first document is persisted, so a cache hit for a
        // source that produced several documents returns just the first one.
        if (Parse) {
            void this.#saveToCache(Parse, docs?.[0], md5File || md5);
        }
        return docs;
    }

    /**
     * Downloads a remote file into the temp directory next to this module.
     *
     * @param {string} url - URL of the file to download.
     * @returns {Promise<{md5: (string|undefined), tempFilePath: string}|null>}
     *   Temp file location and md5 of the content, or `null` when the download
     *   or the write fails (the error is logged).
     */
    async download(url) {
        try {
            const tempFilePath = path.join(TEMP_DIR, this.#tempFileName(url));
            fs.mkdirSync(TEMP_DIR, { recursive: true });

            const fileBuffer = await download(url);
            fs.writeFileSync(tempFilePath, fileBuffer);

            let md5;
            try {
                md5 = this.calcBufferMd5(fileBuffer);
            } catch (errmd5) {
                console.error(errmd5);
            }
            console.log("Url File Md5:", md5);
            return { tempFilePath, md5 };
        } catch (err) {
            console.error(err);
            return null;
        }
    }

    /**
     * Computes the MD5 hash of a Buffer.
     * @param {Buffer} buffer - Data to hash.
     * @returns {string} Hex-encoded MD5 digest.
     */
    calcBufferMd5(buffer) {
        return crypto.createHash("md5").update(buffer).digest("hex");
    }

    /**
     * Asynchronously reads a file into a Buffer.
     * @param {string} filePath - Path of the file to read.
     * @returns {Promise<Buffer>} Resolves with the file content; rejects on read errors.
     */
    readFileBuffer(filePath) {
        return fs.promises.readFile(filePath);
    }

    /** Returns the text after the last "." of the link, ignoring query/hash for URLs. */
    #detectExtension(filelink, isHttp) {
        return this.#pathPart(filelink, isHttp).split(".").pop();
    }

    /** Returns the last path segment of the URL, or an md5 of the URL when there is none. */
    #tempFileName(url) {
        const lastSegment = this.#pathPart(url, HTTP_URL_PATTERN.test(url)).split("/").pop();
        return path.basename(lastSegment) || this.calcBufferMd5(Buffer.from(url));
    }

    /** Returns the pathname of an http(s) URL (no query string or hash), else the link unchanged. */
    #pathPart(link, isHttp) {
        if (!isHttp) {
            return link;
        }
        try {
            return new URL(link).pathname;
        } catch {
            return link; // Not parseable as a URL: fall back to the raw link.
        }
    }

    /** Saves one loaded document to Parse; failures are logged, never thrown. */
    async #saveToCache(Parse, doc, md5) {
        try {
            // Fall back to a fresh subclass if Parse was not yet available at construction.
            const DocumentClass = this.Document || Parse.Object.extend("Document");
            const document = new DocumentClass();
            document.set("type", "entire");
            document.set("md5", md5);
            document.set("pageContent", doc?.pageContent);
            document.set("metadata", doc?.metadata);
            await document.save();
        } catch (errsave) {
            console.error(errsave);
        }
    }
}

module.exports.IndexesService = IndexesService;
 |