Sk4467 committed on
Commit
e721350
·
verified ·
1 Parent(s): af077f4

Update file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +16 -20
file_processing.py CHANGED
@@ -8,6 +8,7 @@ from dotenv import load_dotenv
8
  # load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
9
  openai_api_key = os.environ.get('OPENAI_API_KEY')
10
  from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
 
11
 
12
  # def load_documents(file_path):
13
  # if file_path.endswith('.txt'):
@@ -32,28 +33,23 @@ import tempfile
32
  from langchain.docstore.document import Document
33
 
34
  def read_pdf(file_path: str) -> str:
35
- # Open the PDF with fitz
36
- doc = fitz.open(file_path)
37
- text = ""
38
- for page in doc:
39
- text += page.get_text()
40
-
41
  return text
42
 
43
  def read_docx(file_path: str) -> str:
44
- doc = docx.Document(file_path)
45
- fullText = []
46
- for para in doc.paragraphs:
47
- fullText.append(para.text)
48
- return '\n'.join(fullText)
49
 
50
  def read_csv(file_path: str) -> str:
51
- df = pd.read_csv(file_path)
52
- return df.to_string()
53
-
54
  def read_txt(file_path: str) -> str:
55
- with open(file_path, 'r', encoding='utf-8') as file:
56
- return file.read()
 
57
 
58
  async def load_documents(file: UploadFile) -> List[Document]:
59
  temp_file_path = f"temp_{file.filename}"
@@ -82,15 +78,15 @@ async def load_documents(file: UploadFile) -> List[Document]:
82
  if os.path.exists(temp_file_path):
83
  os.remove(temp_file_path) # Clean up the temporary file
84
 
85
- metadata = {'source': file.filename}
86
- document = Document(page_content=content, metadata=metadata)
87
- return [document]
88
 
89
 
90
 
91
  from langchain.text_splitter import CharacterTextSplitter
92
 
93
- def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
94
  text_splitter = CharacterTextSplitter(
95
  chunk_size=chunk_size,
96
  chunk_overlap=chunk_overlap
 
8
  # load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
9
  openai_api_key = os.environ.get('OPENAI_API_KEY')
10
  from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
11
+ from langchain_community.document_loaders import PyMuPDFLoader,TextLoader,CSVLoader,Docx2txtLoader
12
 
13
  # def load_documents(file_path):
14
  # if file_path.endswith('.txt'):
 
33
  from langchain.docstore.document import Document
34
 
35
def read_pdf(file_path: str) -> str:
    """Extract the full text of a PDF file as a single string.

    PyMuPDFLoader.load() returns one Document per page, not a str, so the
    page texts are joined here to honor the declared ``-> str`` contract.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of every page, separated by newlines.
    """
    loader = PyMuPDFLoader(file_path)
    pages = loader.load()  # one Document per page
    return "\n".join(page.page_content for page in pages)
39
 
40
def read_docx(file_path: str) -> str:
    """Extract the text of a Word (.docx) file as a single string.

    Docx2txtLoader.load() returns a list of Document objects, not a str,
    so their contents are joined to honor the declared ``-> str`` contract.

    Args:
        file_path: Path to the .docx file on disk.

    Returns:
        The document text, with loader sections separated by newlines.
    """
    loader = Docx2txtLoader(file_path)
    docs = loader.load()
    return "\n".join(doc.page_content for doc in docs)
 
 
44
 
45
def read_csv(file_path: str) -> str:
    """Extract the contents of a CSV file as a single string.

    CSVLoader.load() returns one Document per row, not a str, so the row
    texts are joined here to honor the declared ``-> str`` contract.

    Args:
        file_path: Path to the CSV file on disk.

    Returns:
        The row texts concatenated with newline separators.
    """
    loader = CSVLoader(file_path)
    rows = loader.load()  # one Document per CSV row
    return "\n".join(row.page_content for row in rows)
49
def read_txt(file_path: str) -> str:
    """Read a plain-text file and return its contents as a string.

    TextLoader.load() returns a list of Document objects, not a str, so
    their contents are joined to honor the declared ``-> str`` contract.

    Args:
        file_path: Path to the text file on disk.

    Returns:
        The file's text content.
    """
    loader = TextLoader(file_path)
    docs = loader.load()
    return "\n".join(doc.page_content for doc in docs)
53
 
54
  async def load_documents(file: UploadFile) -> List[Document]:
55
  temp_file_path = f"temp_{file.filename}"
 
78
  if os.path.exists(temp_file_path):
79
  os.remove(temp_file_path) # Clean up the temporary file
80
 
81
+ # metadata = {'source': file.filename}
82
+ # document = Document(page_content=content, metadata=metadata)
83
+ return content
84
 
85
 
86
 
87
  from langchain.text_splitter import CharacterTextSplitter
88
 
89
+ def chunk_documents(documents, chunk_size, chunk_overlap):
90
  text_splitter = CharacterTextSplitter(
91
  chunk_size=chunk_size,
92
  chunk_overlap=chunk_overlap