Spaces:
Runtime error
Runtime error
LOUIS SANNA
committed on
Commit
·
6d540dc
1
Parent(s):
7f45ab4
feat(summarization): add document summarization
Browse files
- .gitignore +2 -1
- data/processed/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier-summary.txt +1 -0
- load.py +12 -8
- summarize.py +47 -0
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
.env
|
|
|
|
1 |
+
.env
|
2 |
+
__pycache__/
|
data/processed/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier-summary.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
This article examines the trophic relationships between planthoppers (Hemiptera: Fulgoromorpha) and their host plants on the Mascarene Islands. It was found that the endemic fauna remains essentially on endemic plants and exotic planthoppers have not shifted to them, remaining on exotic plants. Additionally, a positive correlation between endemic planthopper diversity and endemic plant diversity was observed. The article also discusses the disparities between Cixiidae from La Réunion and Mauritius, the host plant data of planthoppers, the biological and ecological data available for the planthoppers, the percentage of records on dicots families for each family of Fulgoromorpha, the effects of alien plant invasions, the cost of polyphagy, and the conservation and restoration of the flora of Mauritius and Rodrigues.
|
load.py
CHANGED
@@ -18,18 +18,22 @@ def parse_documents(path):
|
|
18 |
documents = []
|
19 |
|
20 |
for file_path in pdf_files:
|
21 |
-
|
22 |
-
loader = UnstructuredFileLoader(file_path)
|
23 |
-
document = loader.load()
|
24 |
-
documents.extend(document)
|
25 |
-
print(f"File added: {file_path}")
|
26 |
-
|
27 |
-
except Exception as e:
|
28 |
-
print(f"An error occurred while processing the file {file_path}: {str(e)}")
|
29 |
|
30 |
return documents
|
31 |
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
def split(documents):
    """Split *documents* into chunks with ``CharacterTextSplitter``.

    The splitter is configured with a chunk size of 1000 and an overlap of
    20 between consecutive chunks.

    Args:
        documents: Documents to split, e.g. the result of ``parse_documents``.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for the
        given documents.
    """
    return CharacterTextSplitter(chunk_size=1000, chunk_overlap=20).split_documents(
        documents
    )
|
|
|
18 |
documents = []
|
19 |
|
20 |
for file_path in pdf_files:
|
21 |
+
documents.extend(parse_document(file_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
return documents
|
24 |
|
25 |
|
26 |
+
def parse_document(file_path):
    """Load a single file with ``UnstructuredFileLoader``.

    Loading is best-effort: any failure is reported on stdout instead of
    being raised, so one bad file does not abort a batch.

    Args:
        file_path: Path of the file to load.

    Returns:
        The value returned by ``loader.load()`` (presumably a list of
        documents -- callers pass it to ``list.extend``), or an empty list
        when the file could not be processed.
    """
    try:
        loader = UnstructuredFileLoader(file_path)
        document = loader.load()
    except Exception as e:
        print(f"An error occurred while processing the file {file_path}: {str(e)}")
        # Return an empty list rather than falling through to an implicit
        # None: callers do `documents.extend(parse_document(...))`, which
        # raises TypeError on None.
        return []

    print(f"File parsed: {file_path}")
    return document
|
35 |
+
|
36 |
+
|
37 |
def split(documents):
    """Split *documents* into chunks with ``CharacterTextSplitter``.

    The splitter is configured with a chunk size of 1000 and an overlap of
    20 between consecutive chunks.

    Args:
        documents: Documents to split, e.g. the result of ``parse_documents``.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for the
        given documents.
    """
    return CharacterTextSplitter(chunk_size=1000, chunk_overlap=20).split_documents(
        documents
    )
|
summarize.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
from langchain import OpenAI
|
3 |
+
from langchain.chains.summarize import load_summarize_chain
|
4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from load import parse_document
|
6 |
+
|
7 |
+
# Load variables from a .env file into the environment before the LLM client
# below is created (presumably this supplies the OpenAI API key -- confirm).
load_dotenv()

# Directory containing the raw source documents.
# NOTE(review): not referenced anywhere in the visible part of this file.
DOCUMENT_PATH = "data/raw/cixiidae"


# Module-level LLM client shared by summarize(); it is instantiated at import
# time. temperature=0 is presumably chosen for reproducible summaries.
llm = OpenAI(temperature=0)
|
13 |
+
|
14 |
+
|
15 |
+
def summarize(raw_documents):
    """Summarize documents with a map-reduce summarization chain.

    The input is first re-split into chunks (chunk size 6000, overlap 300,
    breaking on blank lines and then on newlines). The chunks are then run
    through ``load_summarize_chain`` using the module-level ``llm``.

    Args:
        raw_documents: Documents to summarize, e.g. the result of
            ``parse_document``. ``None`` or an empty sequence is accepted.

    Returns:
        The summary produced by the chain, or an empty string when there is
        nothing to summarize.
    """
    # Nothing to do for missing/empty input; avoid calling the splitter (and
    # the LLM) at all.
    if not raw_documents:
        print("No documents to summarize")
        return ""

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], chunk_size=6000, chunk_overlap=300
    )
    docs = text_splitter.split_documents(raw_documents)

    # The splitter can yield no chunks (e.g. documents with empty content);
    # indexing docs[0] below would then raise IndexError.
    if not docs:
        print("No documents to summarize")
        return ""

    num_docs = len(docs)
    num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)
    print(
        f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens"
    )

    summary_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
    output = summary_chain.run(docs)

    return output
|
31 |
+
|
32 |
+
|
33 |
+
def main(
    input_path="data/raw/cixiidae/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier.pdf",
    output_path="data/processed/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier-summary.txt",
):
    """Summarize one document and store the summary as a text file.

    The document is parsed with ``parse_document`` and summarized with
    ``summarize``. The result is printed and written to ``output_path``.

    Args:
        input_path: Path of the document to summarize. Defaults to the
            Fulgoroidea 2008 article.
        output_path: Path of the text file that receives the summary.

    Raises:
        SystemExit: If nothing could be parsed from ``input_path``.
    """
    raw_documents = parse_document(input_path)
    # parse_document reports failures on stdout rather than raising; stop
    # here with a clear message (and non-zero exit status) instead of
    # failing later with an unrelated error.
    if not raw_documents:
        raise SystemExit(f"Nothing could be parsed from {input_path}")

    output = summarize(raw_documents)
    print(output)

    # Explicit UTF-8: the summary may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output)
|
44 |
+
|
45 |
+
|
46 |
+
# Run the summarization only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|