Sk4467 committed on
Commit
e721350
·
verified ·
1 Parent(s): af077f4

Update file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +16 -20
file_processing.py CHANGED
@@ -8,6 +8,7 @@ from dotenv import load_dotenv
8
  # load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
9
  openai_api_key = os.environ.get('OPENAI_API_KEY')
10
  from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
 
11
 
12
  # def load_documents(file_path):
13
  # if file_path.endswith('.txt'):
@@ -32,28 +33,23 @@ import tempfile
32
  from langchain.docstore.document import Document
33
 
34
  def read_pdf(file_path: str) -> str:
35
- # Open the PDF with fitz
36
- doc = fitz.open(file_path)
37
- text = ""
38
- for page in doc:
39
- text += page.get_text()
40
-
41
  return text
42
 
43
  def read_docx(file_path: str) -> str:
44
- doc = docx.Document(file_path)
45
- fullText = []
46
- for para in doc.paragraphs:
47
- fullText.append(para.text)
48
- return '\n'.join(fullText)
49
 
50
  def read_csv(file_path: str) -> str:
51
- df = pd.read_csv(file_path)
52
- return df.to_string()
53
-
54
  def read_txt(file_path: str) -> str:
55
- with open(file_path, 'r', encoding='utf-8') as file:
56
- return file.read()
 
57
 
58
  async def load_documents(file: UploadFile) -> List[Document]:
59
  temp_file_path = f"temp_{file.filename}"
@@ -82,15 +78,15 @@ async def load_documents(file: UploadFile) -> List[Document]:
82
  if os.path.exists(temp_file_path):
83
  os.remove(temp_file_path) # Clean up the temporary file
84
 
85
- metadata = {'source': file.filename}
86
- document = Document(page_content=content, metadata=metadata)
87
- return [document]
88
 
89
 
90
 
91
  from langchain.text_splitter import CharacterTextSplitter
92
 
93
- def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
94
  text_splitter = CharacterTextSplitter(
95
  chunk_size=chunk_size,
96
  chunk_overlap=chunk_overlap
 
8
  # load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
9
  openai_api_key = os.environ.get('OPENAI_API_KEY')
10
  from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
11
+ from langchain_community.document_loaders import PyMuPDFLoader,TextLoader,CSVLoader,Docx2txtLoader
12
 
13
  # def load_documents(file_path):
14
  # if file_path.endswith('.txt'):
 
33
  from langchain.docstore.document import Document
34
 
35
def read_pdf(file_path: str) -> str:
    """Extract the full text of a PDF file as a single string.

    PyMuPDFLoader.load() returns one Document per page, not a str, so the
    page texts are joined here to honor the declared ``-> str`` contract.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of every page, separated by newlines.
    """
    loader = PyMuPDFLoader(file_path)
    pages = loader.load()  # one Document per page
    return "\n".join(page.page_content for page in pages)
39
 
40
def read_docx(file_path: str) -> str:
    """Extract the text of a Word (.docx) file as a single string.

    Docx2txtLoader.load() returns a list of Document objects, not a str,
    so their contents are joined to honor the declared ``-> str`` contract.

    Args:
        file_path: Path to the .docx file on disk.

    Returns:
        The document text, with loader sections separated by newlines.
    """
    loader = Docx2txtLoader(file_path)
    docs = loader.load()
    return "\n".join(doc.page_content for doc in docs)
 
 
44
 
45
def read_csv(file_path: str) -> str:
    """Extract the contents of a CSV file as a single string.

    CSVLoader.load() returns one Document per row, not a str, so the row
    texts are joined here to honor the declared ``-> str`` contract.

    Args:
        file_path: Path to the CSV file on disk.

    Returns:
        The row texts concatenated with newline separators.
    """
    loader = CSVLoader(file_path)
    rows = loader.load()  # one Document per CSV row
    return "\n".join(row.page_content for row in rows)
49
def read_txt(file_path: str) -> str:
    """Read a plain-text file and return its contents as a string.

    TextLoader.load() returns a list of Document objects, not a str, so
    their contents are joined to honor the declared ``-> str`` contract.

    Args:
        file_path: Path to the text file on disk.

    Returns:
        The file's text content.
    """
    loader = TextLoader(file_path)
    docs = loader.load()
    return "\n".join(doc.page_content for doc in docs)
53
 
54
  async def load_documents(file: UploadFile) -> List[Document]:
55
  temp_file_path = f"temp_{file.filename}"
 
78
  if os.path.exists(temp_file_path):
79
  os.remove(temp_file_path) # Clean up the temporary file
80
 
81
+ # metadata = {'source': file.filename}
82
+ # document = Document(page_content=content, metadata=metadata)
83
+ return content
84
 
85
 
86
 
87
  from langchain.text_splitter import CharacterTextSplitter
88
 
89
+ def chunk_documents(documents, chunk_size, chunk_overlap):
90
  text_splitter = CharacterTextSplitter(
91
  chunk_size=chunk_size,
92
  chunk_overlap=chunk_overlap