import { NextApiRequest, NextApiResponse } from 'next';
import fetch, { RequestInfo } from 'node-fetch';
import { JSDOM } from 'jsdom';
// @ts-ignore
import pdfParse from 'pdf-parse';
import puppeteer from 'puppeteer';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { MemoryVectorStore } from 'langchain/vectorstores/memory';
import { HuggingFaceTransformersEmbeddings } from "langchain/embeddings/hf_transformers";
import { createSearchApi } from '../../../app/tools/search'
/**
 * Next.js route-level configuration: cap parsed request bodies at 1 MB
 * so a single oversized payload cannot exhaust server memory.
 */
export const config = {
  api: { bodyParser: { sizeLimit: '1mb' } },
};
// Maximum characters per chunk handed to the text splitter.
const DEFAULT_CHUNK_SIZE = 1000;
// How many nearest-neighbour chunks similaritySearch returns to the caller.
const VECTOR_STORE_SIZE = 10;
// Shared splitter reused by both embed handlers below; chunkOverlap is
// left at the library default.
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: DEFAULT_CHUNK_SIZE });
/**
 * Extracts the plain-text content of a PDF document.
 *
 * Accepts either a Node Buffer or a raw ArrayBuffer (e.g. the result of
 * `fetch(...).arrayBuffer()`), so callers no longer need an unsafe cast.
 *
 * @param buffer - The PDF file bytes.
 * @returns The text extracted by pdf-parse.
 */
async function extractTextFromPDF(buffer: Buffer | ArrayBuffer): Promise<string> {
  // pdf-parse expects a Node Buffer; normalize ArrayBuffer input first.
  const data = await pdfParse(Buffer.isBuffer(buffer) ? buffer : Buffer.from(buffer));
  return data.text;
}
// Sentence-embedding model shared by all vector stores below; constructed
// once at module scope so repeated requests reuse the same instance.
// Presumably runs in-process via transformers.js (no remote API) — verify.
const model = new HuggingFaceTransformersEmbeddings({
modelName: "Xenova/all-MiniLM-L6-v2",
});
// Matches http(s) URLs in free text; the global flag lets replace() strip
// every occurrence when deriving the prompt-without-URL.
const urlRegex = /(https?:\/\/[^\s]+)/g;
// createSearchApi returns its tool(s) in an array; only the first is used.
// NOTE(review): an empty string is passed when SERP_API_KEY is unset —
// presumably the search then fails at request time rather than at startup; verify.
const [serpApi] =
createSearchApi({
apiKey: process.env.SERP_API_KEY || "",
});
/**
 * Fetches a URL and returns its textual content.
 *
 * Strategy:
 *  - Try a plain fetch first. If the server answers with a 4xx/5xx status,
 *    the page may require JavaScript or block plain HTTP clients, so fall
 *    back to rendering it in headless Chrome.
 *  - PDFs are parsed with pdf-parse, HTML is stripped of script/style tags
 *    via JSDOM, and any other content type is returned as raw text.
 *
 * @param targetUrl - Absolute http(s) URL to fetch.
 * @returns The extracted text, trimmed of surrounding whitespace.
 */
const handleContentText = async (targetUrl: string) => {
  const response = await fetch(targetUrl);
  const status = response.status;
  const contentType = response.headers.get('content-type') || '';
  let content: string;
  if (status >= 400) {
    // Error status: retry with a headless browser, which can execute JS
    // and presents a regular browser fingerprint.
    const browser = await puppeteer.launch();
    try {
      const page = await browser.newPage();
      // networkidle0: navigation counts as finished only once the page has
      // stopped issuing network requests.
      await page.goto(targetUrl, { waitUntil: 'networkidle0' });
      content = await page.evaluate(() => document.body.innerText);
    } finally {
      // Always release the Chrome process, even if navigation throws —
      // the original code leaked the browser on error.
      await browser.close();
    }
  } else if (contentType.includes('application/pdf')) {
    // Convert the ArrayBuffer to a Buffer instead of casting to any.
    content = await extractTextFromPDF(Buffer.from(await response.arrayBuffer()));
  } else if (contentType.includes('text/html')) {
    const html = await response.text();
    const dom = new JSDOM(html);
    // Drop script/style elements so only human-readable text remains.
    const scripts = dom.window.document.querySelectorAll('script, style');
    scripts.forEach(element => element.remove());
    content = dom.window.document.body.textContent || '';
  } else {
    // Unknown content type: return the raw body text.
    content = await response.text();
  }
  return content.trim();
}
/**
 * Tool handler: answers a prompt that contains a URL by fetching the page,
 * embedding its chunks in an in-memory vector store, and returning the
 * chunks most similar to the rest of the prompt.
 *
 * @param input - Free-form prompt expected to contain one http(s) URL.
 * @returns A context string for the model, or a fallback message when the
 *          URL is missing or could not be fetched.
 */
const surferEmbedApi = async ({ input }: any) => {
  const urls = input.match(urlRegex);
  const targetUrl = urls ? urls[0] : null;
  const promptWithoutUrl = urls ? input.replace(urlRegex, '').trim() : input;
  // Guard: the original passed a possibly-null URL straight to fetch(),
  // which rejected unhandled. Bail out early instead.
  if (!targetUrl) {
    return `Couldn't find a URL in the input, here is the prompt: ${promptWithoutUrl}`;
  }
  let content = '';
  try {
    content = await handleContentText(targetUrl);
  } catch (error) {
    // A failed fetch falls through to the same fallback as empty content.
    console.error(error);
  }
  if (!content) {
    return `Couldn't find ${targetUrl}, here is the prompt: ${promptWithoutUrl}`;
  }
  const documents = await textSplitter.createDocuments([content]);
  const chunkTexts = documents.map(doc => doc.pageContent);
  // Each chunk's index doubles as its metadata entry.
  const chunkIds = documents.map((_, index) => index);
  // @ts-ignore -- numeric ids are passed where metadata objects are expected
  const vectorStore = await MemoryVectorStore.fromTexts(chunkTexts, chunkIds, model);
  const queryResult = await vectorStore.similaritySearch(promptWithoutUrl, VECTOR_STORE_SIZE);
  return `Here is the context: ${JSON.stringify(queryResult.map(result => result.pageContent))} from using the prompt to lookup relevant information. Here is the prompt: ${promptWithoutUrl}`;
}
/**
 * Tool handler: runs the SERP search for `input`, embeds the returned text
 * in an in-memory vector store, and returns the chunks most similar to the
 * original query.
 */
const serpEmbedApi = async ({ input }: any) => {
  // Search the web for the query text.
  const searchText: string = await serpApi({input})
  // Chunk the result so each piece can be embedded separately.
  const docs = await textSplitter.createDocuments([searchText]);
  const chunkTexts = docs.map(doc => doc.pageContent);
  const chunkIds = docs.map((_, index) => index);
  // @ts-ignore -- numeric chunk ids are passed where metadata objects are expected
  const store = await MemoryVectorStore.fromTexts(chunkTexts, chunkIds, model);
  // Return the chunks closest to the query.
  return store.similaritySearch(input, VECTOR_STORE_SIZE);
}
/**
 * Dispatch table mapping a tool name (the `name` field of the request
 * body) to its implementation. Typed instead of `any` so adding a handler
 * with the wrong shape is caught at compile time.
 */
const handlers: Record<string, (input: any) => Promise<unknown>> = {
  searchApi: serpEmbedApi,
  surferEmbedApi: surferEmbedApi,
};
/**
 * API entry point: dispatches a tool call to the matching handler.
 *
 * Expected body: `{ name: string; args: string }` where `args` is a
 * JSON-encoded argument object for the named function.
 *
 * Responses: 200 with the handler result, 400 for malformed `args` JSON,
 * 500 for unknown function names or handler failures.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  const args = req.body?.args as string;
  const functionName = req.body?.name as string;

  const functionHandler = handlers[functionName];
  if (!functionHandler) {
    console.error(`Function "${functionName}" is not supported.`);
    return res.status(500).json({ error: `Function "${functionName}" is not supported.` });
  }

  // Parse the arguments inside a try/catch: the original parsed before any
  // error handling, so malformed JSON crashed the route unhandled.
  let functionInput: unknown;
  try {
    functionInput = JSON.parse(args);
  } catch {
    console.error(`Invalid JSON in "args" for function "${functionName}".`);
    return res.status(400).json({ error: 'Request "args" is not valid JSON.' });
  }

  try {
    const result = await functionHandler(functionInput);
    return res.status(200).send(result);
  } catch (error: unknown) {
    console.error(error);
    // Narrow instead of @ts-ignore: error may be any thrown value.
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ error: message });
  }
}