Update file_processing.py
file_processing.py CHANGED (+16 -20)
@@ -8,6 +8,7 @@ from dotenv import load_dotenv
 # load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
 openai_api_key = os.environ.get('OPENAI_API_KEY')
 from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
+from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, CSVLoader, Docx2txtLoader
 
 # def load_documents(file_path):
 #     if file_path.endswith('.txt'):
@@ -32,28 +33,23 @@ import tempfile
 from langchain.docstore.document import Document
 
 def read_pdf(file_path: str) -> str:
-    doc = fitz.open(file_path)
-
-    text = ""
-    for page in doc:
-        text += page.get_text()
-
+    loader = PyMuPDFLoader(file_path)
+    text = loader.load()
     return text
 
 def read_docx(file_path: str) -> str:
-    doc = docx.Document(file_path)
-    fullText = []
-    for para in doc.paragraphs:
-        fullText.append(para.text)
-    return '\n'.join(fullText)
+    loader = Docx2txtLoader(file_path)
+    text = loader.load()
+    return text
 
 def read_csv(file_path: str) -> str:
-
-
-
+    loader = CSVLoader(file_path)
+    data = loader.load()
+    return data
 def read_txt(file_path: str) -> str:
-
-
+    loader = TextLoader(file_path)
+    text = loader.load()
+    return text
 
 async def load_documents(file: UploadFile) -> List[Document]:
     temp_file_path = f"temp_{file.filename}"
@@ -82,15 +78,15 @@ async def load_documents(file: UploadFile) -> List[Document]:
     if os.path.exists(temp_file_path):
         os.remove(temp_file_path)  # Clean up the temporary file
 
-    metadata = {'source': file.filename}
-    document = Document(page_content=content, metadata=metadata)
-    return
+    # metadata = {'source': file.filename}
+    # document = Document(page_content=content, metadata=metadata)
+    return content
 
 
 
 from langchain.text_splitter import CharacterTextSplitter
 
-def chunk_documents(documents, chunk_size
+def chunk_documents(documents, chunk_size, chunk_overlap):
     text_splitter = CharacterTextSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap
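
One thing to flag in review: in LangChain, loader.load() returns a list of Document objects rather than a plain string, so the "-> str" annotations on the rewritten read_pdf, read_docx, read_csv, and read_txt no longer match what they return (and load_documents is still annotated "-> List[Document]" while now returning content, a raw value). A minimal sketch of how a caller could flatten loader output back to text; the helper name documents_to_text and the input filename are illustrative, not part of this repo:

from typing import List

from langchain.docstore.document import Document
from langchain_community.document_loaders import PyMuPDFLoader

def documents_to_text(documents: List[Document]) -> str:
    # Each Document holds one page (PDF) or row (CSV) in page_content;
    # join them so callers that expect a plain string keep working.
    return "\n".join(doc.page_content for doc in documents)

# Usage, mirroring the new read_pdf but actually producing a str:
# text = documents_to_text(PyMuPDFLoader("example.pdf").load())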
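The diff also cuts off inside chunk_documents: only the splitter construction is visible. Assuming the function is meant to split LangChain Document objects, a plausible completion is sketched below. split_documents is the standard CharacterTextSplitter API; the return statement and type hints are assumptions about code the diff does not show:

from typing import List

from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter

def chunk_documents(documents: List[Document], chunk_size: int, chunk_overlap: int) -> List[Document]:
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # split_documents returns one Document per chunk, carrying over
    # the source metadata from the parent document.
    return text_splitter.split_documents(documents)

# Usage: chunks of roughly 1000 characters with 100 characters of overlap.
# chunks = chunk_documents(docs, chunk_size=1000, chunk_overlap=100)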