File size: 2,135 Bytes
5e20c77
99a3f34
5e20c77
10330bc
 
 
99a3f34
 
 
 
 
 
 
 
 
 
 
 
5e20c77
 
10330bc
 
 
 
 
 
 
 
 
 
 
 
99a3f34
10330bc
 
 
 
5e20c77
99a3f34
10330bc
 
99a3f34
 
10330bc
99a3f34
 
 
10330bc
99a3f34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10330bc
99a3f34
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from chainlit.types import AskFileResponse
import click
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.vectorstores import Chroma


from src.config import Config
# import chainlit as cl
import logging
import openai
import os
from dotenv import load_dotenv

load_dotenv()




def process_file(file: "AskFileResponse"):
    """Load an uploaded file and split it into chunked documents.

    The upload's bytes are written to a temporary file, read back through a
    loader chosen from ``file.type``, and split with
    ``Config.text_splitter.split_documents``. Each resulting chunk gets its
    ``metadata["source"]`` set to ``"source_<index>"``.

    Args:
        file: Upload exposing ``type`` (MIME type string) and ``content``
            (the payload written to the temporary file; presumably bytes,
            since the file is opened in the default binary mode).

    Returns:
        The list of split documents, in order, with ``source`` metadata set.

    Raises:
        ValueError: If ``file.type`` is neither ``"text/plain"`` nor
            ``"application/pdf"``.
    """
    import tempfile

    if file.type == "text/plain":
        loader_cls = TextLoader
    elif file.type == "application/pdf":
        # The upload is a single file, not a directory, so the single-file
        # PDF loader is required; a directory loader pointed at a file path
        # would find nothing to load.
        from langchain.document_loaders import PyPDFLoader

        loader_cls = PyPDFLoader
    else:
        # Fail loudly instead of hitting an unbound loader name further down.
        raise ValueError(f"Unsupported file type: {file.type!r}")

    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(file.content)
        # The loader reopens the file by name, so buffered bytes must reach
        # the disk first; otherwise small uploads are read as empty.
        tmp.flush()
        documents = loader_cls(tmp.name).load()

    docs = Config.text_splitter.split_documents(documents)

    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs

def get_docSearch(file, cl):
    """Build a Chroma vector store from an uploaded file.

    The file is converted to documents by ``process_file``. Those documents
    are stored under the ``"docs"`` key through ``cl.user_session.set`` and
    then embedded with ``Config.embeddings``.

    Args:
        file: Upload object, forwarded unchanged to ``process_file``.
        cl: Object exposing ``user_session.set`` (presumably the chainlit
            module; confirm against the caller).

    Returns:
        Whatever ``Chroma.from_documents`` returns for the documents.
    """
    chunks = process_file(file)
    logging.info("files loaded ")

    # Keep the chunks available to later handlers of the same session.
    session = cl.user_session
    session.set("docs", chunks)
    logging.info("docs saved in active session")

    vector_store = Chroma.from_documents(chunks, Config.embeddings)
    logging.info("embedding completed")
    return vector_store

def get_source(sources, all_sources, docs, cl):
    """Resolve cited source names to text elements and a summary line.

    ``sources`` is split on commas; each piece is stripped of surrounding
    whitespace and of every ``"."`` character, then looked up in
    ``all_sources``. Names that are not found are skipped.

    Args:
        sources: Comma-separated source names; a falsy value means nothing
            was cited.
        all_sources: Sequence of known source names, index-aligned with
            ``docs``.
        docs: Sequence of objects exposing ``page_content``.
        cl: Object exposing a ``Text(content=..., name=...)`` factory
            (presumably the chainlit module; confirm against the caller).

    Returns:
        A ``(source_elements, answer)`` tuple. ``source_elements`` is the
        list of ``cl.Text`` objects for the names that were found.
        ``answer`` is a string: ``"\\nSources: a, b"`` when at least one
        name matched, ``"\\nNo sources found"`` when none did, and ``""``
        when ``sources`` is falsy.
    """
    # Build the message as a string: appending a str to a list with ``+=``
    # would extend the list one character at a time.
    answer = ""
    source_elements = []
    if not sources:
        return source_elements, answer

    found_sources = []
    for source in sources.split(","):
        source_name = source.strip().replace(".", "")
        try:
            index = all_sources.index(source_name)
        except ValueError:
            # Cited name is not one of the known sources; ignore it.
            continue
        found_sources.append(source_name)
        # Text element that the reply references by name.
        source_elements.append(
            cl.Text(content=docs[index].page_content, name=source_name)
        )

    if found_sources:
        answer += f"\nSources: {', '.join(found_sources)}"
    else:
        answer += "\nNo sources found"
    return source_elements, answer