File size: 4,499 Bytes
c2df9c2
a98334b
c2df9c2
 
 
 
 
 
 
a98334b
c2df9c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4e05b5
a98334b
f4e05b5
 
 
c2df9c2
 
a98334b
c2df9c2
 
a98334b
 
 
 
 
 
 
 
 
 
c2df9c2
 
 
 
 
 
 
 
 
 
 
a98334b
c2df9c2
 
a98334b
78354fc
1c1c1be
c2df9c2
1c1c1be
c2df9c2
1c1c1be
 
 
c2df9c2
 
1c1c1be
 
 
 
 
 
 
 
 
 
 
 
 
78354fc
5f5314e
78354fc
 
 
 
 
 
 
 
 
 
 
45dbf34
 
 
 
78354fc
1c1c1be
f16de12
1c1c1be
f16de12
1c1c1be
45dbf34
 
 
 
 
 
 
c2df9c2
45dbf34
 
c2df9c2
 
 
 
 
45dbf34
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import { NextApiRequest, NextApiResponse } from 'next';
import fetch, { RequestInfo } from 'node-fetch';
import { JSDOM } from 'jsdom';
// @ts-ignore
import pdfParse from 'pdf-parse';
import puppeteer from 'puppeteer';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { MemoryVectorStore } from 'langchain/vectorstores/memory';
import { HuggingFaceTransformersEmbeddings } from "langchain/embeddings/hf_transformers";
import { createSearchApi } from '../../../app/tools/search'

// Next.js API route configuration: cap incoming JSON bodies at 1 MB so
// oversized tool payloads are rejected before they reach the handler.
export const config = {
    api: {
      bodyParser: {
        sizeLimit: '1mb',
      },
    },
};

// Target chunk size (characters) for splitting fetched content before embedding.
const DEFAULT_CHUNK_SIZE = 1000;
// Number of nearest-neighbour chunks returned by each similaritySearch call.
const VECTOR_STORE_SIZE = 10;
// Shared splitter instance, reused across all requests to this route.
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: DEFAULT_CHUNK_SIZE });

/**
 * Extract the plain text of a PDF document.
 *
 * Generalized to accept either a Node `Buffer` or a raw `ArrayBuffer`
 * (as returned by `fetch(...).arrayBuffer()`), so callers no longer need
 * an unsound `as any` cast. Backward-compatible: existing Buffer callers
 * are unaffected.
 *
 * @param buffer - PDF file contents.
 * @returns The concatenated text of all pages, as reported by pdf-parse.
 */
async function extractTextFromPDF(buffer: Buffer | ArrayBuffer): Promise<string> {
  // pdf-parse expects a Node Buffer; normalize ArrayBuffer input here.
  const data = await pdfParse(Buffer.isBuffer(buffer) ? buffer : Buffer.from(buffer));
  return data.text;
}

// Local embedding model (transformers.js build of all-MiniLM-L6-v2); runs
// in-process, no external API call required.
const model = new HuggingFaceTransformersEmbeddings({
    modelName: "Xenova/all-MiniLM-L6-v2",
});

// Matches every http(s) URL in a prompt (global flag: all occurrences).
const urlRegex = /(https?:\/\/[^\s]+)/g;

// Search tool: createSearchApi returns a tuple whose first element is the
// callable search function.
// NOTE(review): falls back to an empty API key when SERP_API_KEY is unset —
// presumably requests then fail at call time; verify deployment config.
const [serpApi] =
  createSearchApi({
    apiKey: process.env.SERP_API_KEY || "",
});

/**
 * Fetch a URL and return its textual content as a string.
 *
 * Strategy, in order:
 *  - HTTP status >= 400: retry with headless Chrome (some sites block plain
 *    fetches or need JS rendering) and return the rendered body text;
 *  - PDF responses: extract text via pdf-parse;
 *  - HTML responses: strip <script>/<style> nodes and return body text;
 *  - anything else: return the raw response text, trimmed.
 *
 * @param targetUrl - Absolute http(s) URL to fetch.
 */
const handleContentText = async (targetUrl: string) => {
  const response = await fetch(targetUrl);
  const status = response.status;
  const contentType = response.headers.get('content-type') || '';
  let content;

  if (status >= 400) {
    // If status is 400 or greater, try using puppeteer.
    const browser = await puppeteer.launch();
    try {
      const page = await browser.newPage();
      // Wait for the network to be idle before considering navigation done.
      await page.goto(targetUrl, { waitUntil: 'networkidle0' });
      content = await page.evaluate(() => document.body.innerText);
    } finally {
      // Bug fix: close the browser even when navigation/evaluation throws;
      // the original leaked a Chromium process on every failed request.
      await browser.close();
    }
    return content;
  } else if (contentType.includes('application/pdf')) {
    // Bug fix: convert the ArrayBuffer to a real Node Buffer rather than
    // smuggling it past the type checker with `as any` — pdf-parse expects
    // a Buffer.
    const buffer = Buffer.from(await response.arrayBuffer());
    content = await extractTextFromPDF(buffer);
  } else if (contentType.includes('text/html')) {
    const html = await response.text();
    const dom = new JSDOM(html);
    // Drop script/style nodes so their source doesn't pollute the text.
    const scripts = dom.window.document.querySelectorAll('script, style');
    scripts.forEach(element => element.remove());
    content = dom.window.document.body.textContent || '';
  } else {
    content = await response.text();
  }
  return content.trim();
}


/**
 * Tool handler: extract the first URL from the prompt, fetch its text,
 * embed the chunks in an in-memory vector store, and return the chunks
 * most relevant to the remaining (URL-stripped) prompt as a context string.
 * Falls back to returning the bare prompt when no URL is found or the
 * fetch/extraction fails.
 */
const surferEmbedApi = async ({ input }: any) => {
  const urls = input.match(urlRegex);
  const targetUrl = urls ? urls[0] : null;
  const promptWithoutUrl = urls ? input.replace(urlRegex, '').trim() : input;

  // Bug fix: the original called handleContentText(null) when the prompt had
  // no URL, so fetch() threw on the literal string "null" and the route
  // returned a 500 instead of the intended fallback message. Also guard
  // against fetch/extraction failures for the same reason.
  let content = '';
  if (targetUrl) {
    try {
      content = await handleContentText(targetUrl);
    } catch (error) {
      console.error(error);
    }
  }
  if (!content) {
    return `Couldn't find ${targetUrl}, here is the prompt: ${promptWithoutUrl}`;
  }

  const documents = await textSplitter.createDocuments([content]);

  // Chunks are keyed by their position; the model embeds each chunk.
  const vectorStore = await MemoryVectorStore.fromTexts(
    // @ts-ignore
    [...documents.map(doc => doc.pageContent)],
    // @ts-ignore
    [...documents.map((v, k) => k)],
    model
  )
  const queryResult = await vectorStore.similaritySearch(promptWithoutUrl, VECTOR_STORE_SIZE);
  return `Here is the context: ${JSON.stringify(queryResult.map(result => result.pageContent))} from using the prompt to lookup relevant information. Here is the prompt: ${promptWithoutUrl}`;
}

/**
 * Tool handler: run the SERP search for the input, chunk the textual result,
 * embed the chunks in an in-memory vector store, and return the chunks most
 * similar to the original query.
 */
const serpEmbedApi = async ({ input }: any) => {
  const searchResult: string = await serpApi({ input });
  const docs = await textSplitter.createDocuments([searchResult]);
  const texts = docs.map(doc => doc.pageContent);
  const ids = docs.map((_, index) => index);
  // @ts-ignore — fromTexts metadata typing doesn't accept numeric ids
  const store = await MemoryVectorStore.fromTexts(texts, ids, model);
  return store.similaritySearch(input, VECTOR_STORE_SIZE);
}
// Dispatch table mapping the tool name sent in the request body to its
// handler function.
const handlers: any = {
  searchApi: serpEmbedApi,
  surferEmbedApi: surferEmbedApi
};

/**
 * API route entry point. Expects a JSON body of the shape
 * `{ name: string, args: string }` where `args` is itself a JSON-encoded
 * string of the tool's input. Dispatches to the matching tool handler and
 * returns its result.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  const args = req.body?.args as string;
  const functionName = req.body?.name as string;

  const functionHandler = handlers[functionName];

  if (!functionHandler) {
    console.error(`Function "${functionName}" is not supported.`);
    return res.status(500).json({ error: `Function "${functionName}" is not supported.` });
  }

  // Bug fix: the original parsed `args` outside any try/catch, so malformed
  // JSON escaped as an unhandled throw instead of a clean error response.
  let functionInput: unknown;
  try {
    functionInput = JSON.parse(args);
  } catch {
    return res.status(400).json({ error: 'Request "args" is not valid JSON.' });
  }

  try {
    const result = await functionHandler(functionInput);
    return res.status(200).send(result);
  } catch (error) {
    console.error(error);
    // Narrow the unknown catch value instead of suppressing it with @ts-ignore.
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ error: message });
  }
}