import os
from pathlib import Path

from langchain.chains.summarize import load_summarize_chain
from langchain_cohere.llms import Cohere
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader


def _resolve_inputs(files):
    """Return the list of path strings that *files* designates.

    *files* may be an iterable of paths, or a single path (``str`` or
    ``os.PathLike``).  A single path naming a directory expands to the
    regular files directly inside it, in sorted order; any other single
    path is treated as one file.  ``None`` and ``""`` yield an empty list.
    """
    # An empty string would otherwise resolve to the current directory.
    if files is None or files == "":
        return []
    if isinstance(files, (str, os.PathLike)):
        root = Path(files)
        if root.is_dir():
            return sorted(str(p) for p in root.iterdir() if p.is_file())
        return [os.fspath(files)]
    return [os.fspath(entry) for entry in files]


def _make_loader(file_path):
    """Return the document loader matching the extension of *file_path*.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.pptx``.
    """
    ext = Path(file_path).suffix.lower()
    if ext == '.pdf':
        return PyPDFLoader(file_path)
    if ext == '.docx':
        return Docx2txtLoader(file_path)
    if ext == '.pptx':
        return UnstructuredPowerPointLoader(file_path)
    raise ValueError(f"Unsupported file extension: {ext}")


def summarize_files(method, files):
    """Summarize each input document with a Cohere-backed summarize chain.

    Args:
        method: Forwarded to ``load_summarize_chain`` as ``chain_type``.
        files: An iterable of file paths, or a single path.  A single path
            that is a directory means "every regular file directly inside
            it".

    Returns:
        A list with one summary per input file, in input order (sorted
        order for a directory).  Empty input returns ``[]``.

    Raises:
        ValueError: If any file has an extension other than ``.pdf``,
            ``.docx`` or ``.pptx`` (case-insensitive).
    """
    paths = _resolve_inputs(files)
    if not paths:
        # Nothing to do: avoid constructing the LLM client at all.
        return []

    # Build every loader up front so an unsupported file fails fast,
    # before any LLM call has been spent on the earlier files.
    loaders = [_make_loader(path) for path in paths]

    llm = Cohere(temperature=0)
    # The chain does not depend on the file, so build it once.
    summarization_chain = load_summarize_chain(llm=llm, chain_type=method)

    return [
        summarization_chain.run(loader.load_and_split())
        for loader in loaders
    ]