| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180 | 
							
- const path = require("path");
 
- const fs = require("fs");
 
- const crypto = require("crypto");
 
- // 网络下载
 
- const download = require("download");
 
- // 加载器
 
- const { DocxLoader } = require("langchain/document_loaders/fs/docx");
 
- const { CSVLoader } = require("langchain/document_loaders/fs/csv");
 
- const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
 
- const { PPTXLoader } = require("langchain/document_loaders/fs/pptx");
 
- const { WebpageLoader } = require("./loaders/web.loader"); // 飞码AI自主开发 可过滤无用标签
 
- // const { CheerioWebBaseLoader } = require("langchain/document_loaders/web/cheerio"); // SSR网页 更加轻量
 
- // const { PuppeteerWebBaseLoader } = require("langchain/document_loaders/web/puppeteer"); // SPA网页 更加通用 不够精准会被爬虫拦截
 
- // 分割器
 
- const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 
/** Matches links that begin with an explicit http:// or https:// scheme. */
const HTTP_LINK_PATTERN = /^https?:\/\//i;

/**
 * Derive the lower-cased extension (without the dot) of a file link.
 * For http(s) links only the URL path is inspected, so a query string or
 * fragment ("report.pdf?token=1") does not leak into the extension.
 * @param {string} filelink - URL or relative file path.
 * @param {boolean} isHttp - Whether filelink is an http(s) URL.
 * @returns {string} Extension such as "pdf", or "" when there is none.
 */
function inferExtension(filelink, isHttp) {
    let target = filelink;
    if (isHttp) {
        try {
            target = new URL(filelink).pathname;
        } catch {
            // Not parseable as a URL: trim query/fragment by hand instead.
            target = filelink.split(/[?#]/, 1)[0];
        }
    }
    return path.extname(target).slice(1).toLowerCase();
}

/**
 * Pick a file name for the temporary copy of a downloaded URL.
 * Uses only the last segment of the URL path, so query strings and
 * directory separators never end up in the name.
 * @param {string} url - Remote file URL.
 * @returns {string} A non-empty file name.
 */
function tempFileNameFor(url) {
    let pathname;
    try {
        pathname = new URL(url).pathname;
    } catch {
        pathname = String(url).split(/[?#]/, 1)[0];
    }
    return path.basename(pathname) || "download";
}

/**
 * Loads documents (docx / pdf / csv / pptx files or web pages) through the
 * matching loader, splits them into chunks, and caches whole-document load
 * results in a Parse "Document" class keyed by md5 when Parse is available.
 */
class IndexesService {
    // Resolved lazily-safe: stays undefined when no global Parse SDK is present,
    // matching load(), which treats Parse as optional.
    Document = global.Parse?.Object.extend("Document");

    // Maps a file extension (or the pseudo type "webpage") to its loader class.
    loaderMap = {
        "docx": DocxLoader,
        "pdf": PDFLoader,
        "csv": CSVLoader,
        "pptx": PPTXLoader,
        "webpage": WebpageLoader,
    };

    /**
     * Split loaded documents into overlapping chunks with
     * RecursiveCharacterTextSplitter.
     * @param {Array} docs - Documents as returned by load().
     * @param {{chunkSize?: number, chunkOverlap?: number}} [options]
     *   chunkSize: maximum chunk size in characters (default 500).
     *   chunkOverlap: characters shared between neighbouring chunks (default 100).
     * @returns {Promise<Array>} The split documents.
     */
    async split(docs, options) {
        const splitter = new RecursiveCharacterTextSplitter({
            chunkSize: options?.chunkSize || 500,
            // ?? rather than ||, so an explicit overlap of 0 is honoured.
            chunkOverlap: options?.chunkOverlap ?? 100,
        });
        return splitter.splitDocuments(docs);
    }

    /**
     * Load a document from a URL or from a path relative to this module.
     *
     * When md5File is given and Parse is available, a previously stored
     * Document with that md5 is returned instead of loading again.
     *
     * @param {string} filelink - http(s) URL, or file path relative to __dirname.
     * @param {string} [extend] - Loader key ("docx", "pdf", "csv", "pptx",
     *   "webpage"); derived from filelink when omitted.
     * @param {string} [md5File] - md5 of the file, used as the cache key.
     * @returns {Promise<Array|undefined>} Loaded documents, or undefined when
     *   the file is missing, the download fails, or the type is unsupported.
     */
    async load(filelink, extend, md5File) {
        const Parse = global.Parse;

        // Cache lookup: reuse the stored result when the md5 is already known.
        if (md5File && Parse) {
            const query = new Parse.Query("Document");
            query.equalTo("md5", md5File);
            const cached = await query.first();
            if (cached?.id) {
                return [{
                    metadata: cached.get("metadata"),
                    pageContent: cached.get("pageContent"),
                }];
            }
        }

        if (typeof filelink !== "string" || filelink === "") {
            console.log("文件不存在:", filelink);
            return;
        }

        // Only a leading scheme makes a link remote; "http" elsewhere in a
        // local path must not trigger a download.
        const isHttp = HTTP_LINK_PATTERN.test(filelink);
        const extension = String(extend || inferExtension(filelink, isHttp)).toLowerCase();
        // Own-property check so keys like "constructor" never resolve to a loader.
        let LoaderClass = Object.prototype.hasOwnProperty.call(this.loaderMap, extension)
            ? this.loaderMap[extension]
            : undefined;

        let loaderPath = "";
        let hasDownloaded = false;
        let md5;

        if (isHttp) {
            if (!LoaderClass || LoaderClass === WebpageLoader) {
                // Plain web page: the web loader receives the URL itself.
                LoaderClass = WebpageLoader;
                loaderPath = filelink;
            } else {
                // Remote file of a known type: download to a temp file first.
                let downloaded = null;
                let downloadError;
                try {
                    downloaded = await this.download(filelink);
                } catch (err) {
                    downloadError = err;
                }
                if (!downloaded) {
                    console.log("文件下载失败", filelink, downloadError);
                    return;
                }
                md5 = downloaded.md5;
                loaderPath = downloaded.tempFilePath;
                hasDownloaded = true;
            }
        } else {
            loaderPath = path.join(__dirname, filelink);
            if (!fs.existsSync(loaderPath)) {
                console.log("文件不存在:", loaderPath);
                return;
            }
        }

        // Must be checked before construction: `new undefined` would throw.
        if (!LoaderClass) {
            console.log("仅支持:", Object.keys(this.loaderMap));
            return;
        }

        let docs;
        try {
            docs = await new LoaderClass(loaderPath).load();
        } finally {
            // Remove the temp download even when the loader throws.
            if (hasDownloaded) {
                fs.rmSync(loaderPath, { force: true });
            }
        }

        // Store the load result under its md5 for later cache hits.
        // NOTE(review): only the first document is stored, so a cache hit on a
        // multi-document file returns less than a fresh load — confirm intent.
        if (Parse) {
            const DocumentClass = this.Document || Parse.Object.extend("Document");
            const firstDoc = docs?.[0];
            const document = new DocumentClass();
            document.set("type", "entire");
            document.set("md5", md5File || md5);
            document.set("pageContent", firstDoc?.pageContent);
            document.set("metadata", firstDoc?.metadata);
            try {
                // Awaited so a rejected save is actually caught here.
                await document.save();
            } catch (saveError) {
                console.error("Document cache save failed", saveError);
            }
        }

        return docs;
    }

    /**
     * Download a remote file into the local "temp" directory.
     * @param {string} url - Remote file URL.
     * @returns {Promise<{md5:string,tempFilePath:string}|null>} Temp file path
     *   and md5 of its content, or null when the download or write fails.
     */
    async download(url) {
        const tempDir = path.join(__dirname, "temp");
        const tempFilePath = path.join(tempDir, tempFileNameFor(url));
        // recursive: true makes this a no-op when the directory already exists.
        fs.mkdirSync(tempDir, { recursive: true });

        try {
            const fileBuffer = await download(url);
            fs.writeFileSync(tempFilePath, fileBuffer);

            let md5;
            try {
                md5 = this.calcBufferMd5(fileBuffer);
            } catch (errmd5) {
                // The md5 is only a cache key; keep the download usable without it.
                console.error(errmd5);
            }
            console.log("Url File Md5:", md5);
            return { tempFilePath, md5 };
        } catch (err) {
            console.error(err);
            return null;
        }
    }

    /**
     * Compute the MD5 hash of a Buffer.
     * @param {Buffer} buffer - Data to hash.
     * @returns {string} Hex-encoded MD5 digest.
     */
    calcBufferMd5(buffer) {
        return crypto.createHash("md5").update(buffer).digest("hex");
    }

    /**
     * Read a file's content into a Buffer.
     * @param {string} filePath - Path of the file to read.
     * @returns {Promise<Buffer>} Resolves with the file content; rejects with
     *   the fs error when the read fails.
     */
    readFileBuffer(filePath) {
        return new Promise((resolve, reject) => {
            fs.readFile(filePath, (err, data) => {
                if (err) {
                    reject(err);
                    return;
                }
                resolve(data);
            });
        });
    }
}
 
// Expose the service class as a named CommonJS export.
Object.assign(module.exports, { IndexesService });
 
 
  |