import { NextApiRequest, NextApiResponse } from 'next';
import fetch from 'node-fetch';
import { JSDOM } from 'jsdom';
// @ts-ignore -- pdf-parse does not ship its own type declarations
import pdfParse from 'pdf-parse';
import puppeteer from 'puppeteer';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { MemoryVectorStore } from 'langchain/vectorstores/memory';
import { HuggingFaceTransformersEmbeddings } from "langchain/embeddings/hf_transformers";

export const config = {
    api: {
      bodyParser: {
        sizeLimit: '1mb',
      },
    },
};

// Chunking and retrieval parameters.
const DEFAULT_CHUNK_SIZE = 1000; // characters per chunk
const VECTOR_STORE_SIZE = 10; // number of top-matching chunks returned by the similarity search
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: DEFAULT_CHUNK_SIZE });

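// Extract the raw text from a PDF buffer.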
async function extractTextFromPDF(buffer: Buffer): Promise<string> {
  const data = await pdfParse(buffer);
  return data.text;
}

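// Embedding model executed locally via transformers.js; no external API key is required.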
const model = new HuggingFaceTransformersEmbeddings({
    modelName: "Xenova/all-MiniLM-L6-v2",
});

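// Matches http(s) URLs embedded in the prompt.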
const urlRegex = /(https?:\/\/[^\s]+)/g;

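// Fetch the target URL and return its text content: PDFs go through pdf-parse, static HTML is
// stripped of script/style tags via JSDOM, and pages that render client-side fall back to a
// headless browser. Any other content type is returned as plain text.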
const handleContentText = async (targetUrl: string) => {
  const response = await fetch(targetUrl);
  const contentType = response.headers.get('content-type') || '';
  let content;
  if (contentType.includes('application/pdf')) {
    // node-fetch returns an ArrayBuffer; pdf-parse expects a Node Buffer.
    const buffer = Buffer.from(await response.arrayBuffer());
    content = await extractTextFromPDF(buffer);
  } else if (contentType.includes('text/html')) {
    const html = await response.text();
    const dom = new JSDOM(html);
    const scripts = dom.window.document.querySelectorAll('script, style');
    scripts.forEach(element => element.remove());
    content = dom.window.document.body.textContent || '';

    if (!content.trim()) {
      // The static HTML had no text, so the page likely renders client-side;
      // retry with a headless browser and make sure it is closed even if an error occurs.
      const browser = await puppeteer.launch();
      try {
        const page = await browser.newPage();
        await page.goto(targetUrl);
        content = await page.evaluate(() => document.body.innerText);
      } finally {
        await browser.close();
      }
    }
  } else {
    content = await response.text();
  }
  return content;
}

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  const prompt = req.body?.prompt as string | undefined;
  if (!prompt) {
    return res.status(400).json({ error: 'Missing "prompt" in request body' });
  }

  const urls = prompt.match(urlRegex);
  const targetUrl = urls ? urls[0] : null;
  const promptWithoutUrl = urls ? prompt.replace(urlRegex, '').trim() : prompt;

  if (!targetUrl) {
    // No URL to fetch: nothing to retrieve, so just pass the prompt through.
    return res.status(200).send(`Couldn't find a URL, here is the prompt: ${prompt}`);
  }
  
  try {
    const content: string = await handleContentText(targetUrl)
    if (!content) {
      return res.status(200).send(`Couldn't extract any content from ${targetUrl}, here is the prompt: ${promptWithoutUrl}`);
    }
    
    const documents = await textSplitter.createDocuments([content]);

    const vectorStore = await MemoryVectorStore.fromTexts(
      documents.map(doc => doc.pageContent),
      documents.map((_, index) => ({ index })),
      model
    );
    const queryResult = await vectorStore.similaritySearch(promptWithoutUrl, VECTOR_STORE_SIZE);
    return res.status(200).send(
      `Here is the context: ${JSON.stringify(queryResult.map(result => result.pageContent))} retrieved by using the prompt to look up relevant information. Here is the prompt: ${promptWithoutUrl}`
    );
  } catch (error) {
    console.error(error);
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ error: message });
  }
}
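
// Example request (the route path below is an assumption; it depends on where this
// file sits under pages/api/):
//
//   curl -X POST http://localhost:3000/api/context \
//     -H 'Content-Type: application/json' \
//     -d '{"prompt": "Summarize the main findings of https://example.com/report.pdf"}'
//
// The handler pulls the URL out of the prompt, fetches and chunks its text, embeds the
// chunks locally with Xenova/all-MiniLM-L6-v2, and responds with the 10 most similar
// chunks plus the remaining prompt text.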