Spaces:
Sleeping
Sleeping
import { NextApiRequest, NextApiResponse } from 'next'; | |
import fetch from 'node-fetch'; | |
import { JSDOM } from 'jsdom'; | |
// @ts-ignore | |
import pdfParse from 'pdf-parse'; | |
import puppeteer from 'puppeteer'; | |
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; | |
import { MemoryVectorStore } from 'langchain/vectorstores/memory'; | |
import { HuggingFaceTransformersEmbeddings } from "langchain/embeddings/hf_transformers"; | |
import { createSerpApi } from '../../../app/tools/serp-api' | |
/**
 * Route-level configuration exported for the framework: sets the body
 * parser's `sizeLimit` to `'1mb'`.
 */
export const config = {
  api: {
    bodyParser: { sizeLimit: '1mb' },
  },
};

/** `chunkSize` handed to the text splitter below. */
const DEFAULT_CHUNK_SIZE = 1000;

/**
 * Passed as the second argument of `similaritySearch`, i.e. how many matches
 * are requested per query (despite the name, it is not a store capacity).
 */
const VECTOR_STORE_SIZE = 10;

/** Shared splitter used to cut fetched text into chunks before embedding. */
const textSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: DEFAULT_CHUNK_SIZE,
});
/**
 * Runs `pdfParse` over the given bytes and resolves with the `text` field of
 * its result.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The text reported by `pdfParse`.
 */
async function extractTextFromPDF(buffer: Buffer): Promise<string> {
  const { text } = await pdfParse(buffer);
  return text;
}
/** Embeddings instance shared by every vector store built in this module. */
const model = new HuggingFaceTransformersEmbeddings({ modelName: 'Xenova/all-MiniLM-L6-v2' });

/** Matches every `http://` / `https://` run of non-whitespace characters. */
const urlRegex = /(https?:\/\/[^\s]+)/g;

/**
 * First element of what `createSerpApi` returns; it is later invoked as
 * `serpApi({ input })`. The key comes from `SERP_API_KEY`, falling back to an
 * empty string when the variable is unset.
 */
const [serpApi] = createSerpApi({ apiKey: process.env.SERP_API_KEY || '' });
/**
 * Fetches `targetUrl` and returns its textual content, choosing the
 * extraction strategy from the response's `content-type` header:
 *
 * - `application/pdf`: the body is parsed with `extractTextFromPDF`.
 * - `text/html`: the markup is loaded into JSDOM, `<script>`/`<style>` nodes
 *   are removed and the body's `textContent` is returned. If that text is
 *   blank, the page is opened in Puppeteer and `document.body.innerText` is
 *   returned instead.
 * - anything else: the raw response text is returned.
 *
 * @param targetUrl - Absolute URL to download.
 * @returns The extracted text (may be an empty string).
 * @throws Whatever `fetch`, `pdfParse`, JSDOM or Puppeteer throw.
 */
const handleContentText = async (targetUrl: string): Promise<string> => {
  const response = await fetch(targetUrl);
  const contentType = response.headers.get('content-type') || '';

  if (contentType.includes('application/pdf')) {
    // extractTextFromPDF is typed for a Node Buffer; wrap the ArrayBuffer
    // instead of casting it through `any`.
    const pdfBytes = Buffer.from(await response.arrayBuffer());
    return extractTextFromPDF(pdfBytes);
  }

  if (!contentType.includes('text/html')) {
    return response.text();
  }

  const html = await response.text();
  const dom = new JSDOM(html);
  dom.window.document
    .querySelectorAll('script, style')
    .forEach((element) => element.remove());

  const staticText = dom.window.document.body.textContent || '';
  if (staticText.trim()) {
    return staticText;
  }

  // The static markup yielded no text: fall back to a real browser.
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(targetUrl);
    return await page.evaluate(() => document.body.innerText);
  } finally {
    // Always release the browser process, even when navigation or
    // evaluation fails.
    await browser.close();
  }
};
/**
 * Builds a context-augmented prompt from the first URL found in `input`.
 *
 * The first match of `urlRegex` is fetched via `handleContentText`; every URL
 * is then stripped from the prompt. The fetched text is chunked, embedded
 * into an in-memory vector store and queried with the URL-free prompt; the
 * matching chunks are serialised into the returned string.
 *
 * @param input - Prompt text expected to contain at least one http(s) URL.
 * @returns A string carrying the matched context and the URL-free prompt, or
 *   a "Couldn't find" message when the fetched page produced no text.
 * @throws Error when `input` contains no URL (previously this surfaced as an
 *   opaque fetch failure on a `null` URL), plus anything thrown while
 *   fetching or embedding.
 */
const surferEmbedApi = async ({ input }: any) => {
  const urls = input.match(urlRegex);
  if (!urls) {
    throw new Error('No http(s) URL found in the prompt; surferEmbedApi needs a URL to fetch.');
  }

  const targetUrl: string = urls[0];
  const promptWithoutUrl = input.replace(urlRegex, '').trim();

  const content: string = await handleContentText(targetUrl);
  if (!content) {
    return `Couldn't find ${targetUrl}, here is the prompt: ${promptWithoutUrl}`;
  }

  const documents = await textSplitter.createDocuments([content]);
  // Only pageContent is read from the search results, so the documents can be
  // indexed directly instead of re-packing them with numeric metadata.
  const vectorStore = await MemoryVectorStore.fromDocuments(documents, model);
  const queryResult = await vectorStore.similaritySearch(promptWithoutUrl, VECTOR_STORE_SIZE);

  return `Here is the context: ${JSON.stringify(queryResult.map(result => result.pageContent))} from using the prompt to lookup relevant information. Here is the prompt: ${promptWithoutUrl}`;
};
/**
 * Calls `serpApi` with the prompt, chunks the returned text, embeds the
 * chunks into an in-memory vector store and resolves with the documents most
 * similar to the prompt.
 *
 * @param input - Prompt used both as the `serpApi` input and as the
 *   similarity query.
 * @returns The documents returned by `similaritySearch`; each carries its
 *   chunk index as metadata.
 */
const serpEmbedApi = async ({ input }: any) => {
  const searchText: string = await serpApi({ input });
  const chunks = await textSplitter.createDocuments([searchText]);

  const chunkTexts = chunks.map((chunk) => chunk.pageContent);
  const chunkIndexes = chunks.map((_chunk, position) => position);

  // @ts-ignore
  const store = await MemoryVectorStore.fromTexts(chunkTexts, chunkIndexes, model);

  return store.similaritySearch(input, VECTOR_STORE_SIZE);
};
/**
 * API route entry point.
 *
 * Reads `prompt` and `name` from the request body and dispatches to
 * `serpEmbedApi` when `name === 'serpApi'`, otherwise to `surferEmbedApi`.
 *
 * Responses:
 * - 200 with the selected function's result.
 * - 400 with `{ error }` when `prompt` is missing or not a string.
 * - 500 with `{ error }` when the selected function throws.
 *
 * @param req - Incoming request; the body is expected to look like
 *   `{ prompt: string, name?: string }`.
 * @param res - Response used to send the result or the error payload.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  // Tolerate an absent body so validation answers instead of a TypeError.
  const body = req.body || {};
  const prompt = body.prompt;
  const functionName = body.name;

  if (typeof prompt !== 'string') {
    return res.status(400).json({ error: 'Request body must include a string "prompt".' });
  }

  try {
    const result =
      functionName === 'serpApi'
        ? await serpEmbedApi({ input: prompt })
        : await surferEmbedApi({ input: prompt });
    return res.status(200).send(result);
  } catch (error) {
    console.error(error);
    // Anything can be thrown; only Error instances are known to carry `message`.
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ error: message });
  }
}