import os

from dotenv import load_dotenv

# Loaders referenced only by the commented-out LangChain-based loader below.
from langchain.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader, CSVLoader

# Load the OpenAI API key from the backend .env file.
load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Alternative implementation using LangChain document loaders (kept for reference):
# def load_documents(file_path):
#     if file_path.endswith('.txt'):
#         loader = TextLoader(file_path)
#     elif file_path.endswith('.pdf'):
#         loader = PyPDFLoader(file_path)
#     elif file_path.endswith('.doc') or file_path.endswith('.docx'):
#         loader = UnstructuredWordDocumentLoader(file_path)
#     elif file_path.endswith('.csv'):
#         loader = CSVLoader(file_path)
#     else:
#         raise ValueError(f"Unsupported file format: {file_path}")

#     documents = loader.load()
#     return documents
from fastapi import UploadFile
from typing import List
import fitz  # PyMuPDF
import pandas as pd
import docx  # python-docx
from langchain.docstore.document import Document
def read_pdf(file_path: str) -> str:
    # Extract plain text from every page of a PDF with PyMuPDF.
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

def read_docx(file_path: str) -> str:
    # Concatenate all paragraph text from a Word document (python-docx).
    doc = docx.Document(file_path)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

def read_csv(file_path: str) -> str:
    # Render the whole CSV as a plain-text table via pandas.
    df = pd.read_csv(file_path)
    return df.to_string()

def read_txt(file_path: str) -> str:
    # Read a UTF-8 text file verbatim.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

async def load_documents(file: UploadFile) -> List[Document]:
    """Persist the upload to a temp file, extract its text, and wrap it in a single Document."""
    temp_file_path = f"temp_{file.filename}"
    try:
        # Save the uploaded file to a temporary file
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(await file.read())

        content = ""
        if file.filename.endswith('.pdf'):
            content = read_pdf(temp_file_path)
        elif file.filename.endswith('.docx'):
            content = read_docx(temp_file_path)
        elif file.filename.endswith('.csv'):
            content = read_csv(temp_file_path)
        elif file.filename.endswith('.txt'):
            content = read_txt(temp_file_path)
        else:
            raise ValueError("Unsupported file format")
    except Exception as e:
        # Any failure (including the unsupported-format ValueError above) lands here;
        # log or re-raise as appropriate for your application.
        print(f"Error processing document: {e}")
        content = "Error processing document."
    finally:
        # Cleanup: remove the temporary file
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    metadata = {'source': file.filename}
    document = Document(page_content=content, metadata=metadata)
    return [document]
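
# Usage sketch (assumption, not from the original module): load_documents is a
# coroutine that expects a FastAPI UploadFile, so it would typically be awaited
# inside an upload endpoint, e.g.:
#
#     from fastapi import FastAPI, UploadFile
#
#     app = FastAPI()
#
#     @app.post("/upload")
#     async def upload(file: UploadFile):
#         docs = await load_documents(file)
#         return {"source": docs[0].metadata["source"],
#                 "characters": len(docs[0].page_content)}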


from langchain.text_splitter import CharacterTextSplitter

def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    # Split each Document into overlapping chunks sized for the embedding model.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunked_docs = text_splitter.split_documents(documents)
    return chunked_docs


from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

def create_embeddings(chunked_docs, collection_name, persist_directory=None):
    # Embed the chunks with OpenAI and index them in a Chroma collection.
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    vector_store = Chroma.from_documents(chunked_docs, embeddings,
                                         collection_name=collection_name,
                                         persist_directory=persist_directory)
    if persist_directory:
        # Chroma can only persist to disk when a directory was supplied at creation.
        vector_store.persist()

    return vector_store
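

# Minimal end-to-end sketch (assumption: illustrative only, not part of the original
# pipeline). It wraps raw text in a Document, chunks it, and indexes it in Chroma.
# The sample text and collection name are placeholders; OPENAI_API_KEY must be set
# in the .env file loaded above for the embedding call to succeed.
if __name__ == "__main__":
    sample = Document(
        page_content="LangChain splits documents into chunks, embeds them, and stores them in Chroma.",
        metadata={"source": "example.txt"},
    )
    chunks = chunk_documents([sample], chunk_size=200, chunk_overlap=20)
    store = create_embeddings(chunks, collection_name="demo_collection")
    print(f"Indexed {len(chunks)} chunk(s) into collection 'demo_collection'.")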