| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 | 
							
- const {
 
-     SupportedTextSplitterLanguages,
 
-     RecursiveCharacterTextSplitter,
 
-     TokenTextSplitter
 
-   } = require("langchain/text_splitter");
 
- var mammoth = require("mammoth");
 
- const fs = require('fs');
 
- const pdf = require('pdf-parse');
 
/**
 * Demo entry point: loads a sample PDF and a sample DOCX file and splits the
 * content of each into chunks, printing the results with console.log.
 *
 * Stage 1 reads "../data/pgvector.pdf" through pdfLoader and splits its text
 * with a TokenTextSplitter (encodingName "gpt2", chunkSize 500, chunkOverlap 0).
 * Stage 2 converts "../data/pgvector.docx" to HTML through docsLoader and
 * splits it with a RecursiveCharacterTextSplitter created for the "html"
 * language (chunkSize 4096, chunkOverlap 20).
 *
 * @returns {Promise<void>} Resolves after both stages have been printed.
 *     Rejects with whatever error a loader or splitter raises.
 */
async function main() {
    // Loader extracts the plain text of the PDF.
    const data = await pdfLoader("../data/pgvector.pdf");
    const text = data.text;
    console.log(text);

    const tokenSplitter = new TokenTextSplitter({
        encodingName: "gpt2",
        chunkSize: 500,
        chunkOverlap: 0,
    });
    const tokenChunks = await tokenSplitter.createDocuments([text]);
    console.log(tokenChunks);

    // Split the HTML generated from the DOCX file into text chunks.
    // Each stage uses its own variable names: redeclaring the same `const`
    // twice in one scope is a SyntaxError that prevents the file from loading.
    const html = await docsLoader("../data/pgvector.docx");
    const htmlSplitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
        chunkSize: 4096,
        chunkOverlap: 20,
    });
    const htmlChunks = await htmlSplitter.createDocuments([html]);

    console.log(htmlChunks);
    console.log(JSON.stringify(htmlChunks[0]));
}
 
// Run the demo. The promise returned by main() is handled explicitly so that a
// failure is reported and reflected in the process exit code instead of
// surfacing as an unhandled rejection.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
 
/**
 * Converts the document at `path` to HTML using mammoth.
 *
 * The generated HTML and the messages reported by the conversion are printed
 * with console.log before the HTML is returned.
 *
 * @param {string} path - Location of the document handed to mammoth.convertToHtml.
 * @returns {Promise<string>} The generated HTML (`result.value`).
 * @throws Rethrows any error raised by the conversion after logging it, so the
 *     caller sees a rejection instead of a promise that never settles.
 */
async function docsLoader(path) {
    try {
        const result = await mammoth.convertToHtml({ path: path });
        const html = result.value; // The generated HTML
        const messages = result.messages; // Any messages, such as warnings during conversion
        console.log(html);
        console.log(messages);
        return html;
    } catch (error) {
        console.error(error);
        // Propagate the failure; only logging it would leave callers waiting forever.
        throw error;
    }
}
 
/**
 * Reads the file at `path` and parses it with pdf-parse (`pdf`).
 *
 * Several fields of the parse result are printed with console.log before the
 * result is returned.
 *
 * @param {string} path - Location of the PDF file to read.
 * @returns {Promise<object>} The object produced by `pdf(...)`; the caller in
 *     this file reads its `text` property.
 * @throws Rejects if the file cannot be read or if parsing fails. Awaiting the
 *     parser directly lets a parse failure reach the caller instead of leaving
 *     the returned promise pending.
 */
async function pdfLoader(path) {
    // Non-blocking read; a missing or unreadable file rejects the returned promise.
    const dataBuffer = await fs.promises.readFile(path);
    const data = await pdf(dataBuffer);

    // number of pages
    console.log(data.numpages);
    // number of rendered pages
    console.log(data.numrender);
    // PDF info
    console.log(data.info);
    // PDF metadata
    console.log(data.metadata);
    // PDF.js version
    // check https://mozilla.github.io/pdf.js/getting_started/
    console.log(data.version);
    // PDF text
    console.log(data.text);

    return data;
}
 
 
  |