LOUIS SANNA committed on
Commit
6d540dc
·
1 Parent(s): 7f45ab4

feat(summarization): add document summarization

Browse files
.gitignore CHANGED
@@ -1 +1,2 @@
1
- .env
 
 
1
+ .env
2
+ __pycache__/
data/processed/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier-summary.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ This article examines the trophic relationships between planthoppers (Hemiptera: Fulgoromorpha) and their host plants on the Mascarene Islands. It was found that the endemic fauna remains essentially on endemic plants and exotic planthoppers have not shifted to them, remaining on exotic plants. Additionally, a positive correlation between endemic planthopper diversity and endemic plant diversity was observed. The article also discusses the disparities between Cixiidae from La Réunion and Mauritius, the host plant data of planthoppers, the biological and ecological data available for the planthoppers, the percentage of records on dicot families for each family of Fulgoromorpha, the effects of alien plant invasions, the cost of polyphagy, and the conservation and restoration of the flora of Mauritius and Rodrigues.
load.py CHANGED
@@ -18,18 +18,22 @@ def parse_documents(path):
18
  documents = []
19
 
20
  for file_path in pdf_files:
21
- try:
22
- loader = UnstructuredFileLoader(file_path)
23
- document = loader.load()
24
- documents.extend(document)
25
- print(f"File added: {file_path}")
26
-
27
- except Exception as e:
28
- print(f"An error occurred while processing the file {file_path}: {str(e)}")
29
 
30
  return documents
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
33
  def split(documents):
34
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
35
  return text_splitter.split_documents(documents)
 
18
  documents = []
19
 
20
  for file_path in pdf_files:
21
+ documents.extend(parse_document(file_path))
 
 
 
 
 
 
 
22
 
23
  return documents
24
 
25
 
26
def parse_document(file_path):
    """Load a single file with ``UnstructuredFileLoader``.

    Args:
        file_path: Path of the file to parse.

    Returns:
        Whatever ``loader.load()`` returns on success. On any failure the
        error is printed and an empty list is returned, so callers that do
        ``documents.extend(parse_document(path))`` never receive ``None``.
    """
    try:
        loader = UnstructuredFileLoader(file_path)
        document = loader.load()
    except Exception as e:
        # Best-effort boundary: one unreadable file is reported and skipped
        # instead of aborting the whole batch.
        print(f"An error occurred while processing the file {file_path}: {str(e)}")
        return []

    print(f"File parsed: {file_path}")
    return document
35
+
36
+
37
  def split(documents):
38
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
39
  return text_splitter.split_documents(documents)
summarize.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ from langchain import OpenAI
3
+ from langchain.chains.summarize import load_summarize_chain
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from load import parse_document
6
+
7
# Load variables from a .env file into the environment before the LLM client
# below is created (presumably this is where the OpenAI API key comes from;
# confirm against the deployment setup).
+ load_dotenv()
8
+
9
# NOTE(review): DOCUMENT_PATH is not referenced anywhere else in the visible
# part of this file; main() spells out its paths directly.
+ DOCUMENT_PATH = "data/raw/cixiidae"
10
+
11
+
12
# Module-level LLM client shared by summarize(); created with temperature=0.
+ llm = OpenAI(temperature=0)
13
+
14
+
15
def summarize(raw_documents):
    """Summarize documents with a map-reduce summarization chain.

    The documents are split into chunks (``chunk_size=6000``,
    ``chunk_overlap=300``, splitting on blank lines then newlines), a short
    diagnostic line is printed, and the chunks are passed to the chain built
    by ``load_summarize_chain`` using the module-level ``llm``.

    Args:
        raw_documents: Documents to summarize, as produced by
            ``parse_document``. May be empty or ``None``.

    Returns:
        The chain's output for the chunks, or an empty string when there is
        nothing to summarize.
    """
    # A failed parse can hand us None or an empty list; there is nothing to
    # send to the LLM in that case.
    if not raw_documents:
        return ""

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], chunk_size=6000, chunk_overlap=300
    )
    docs = text_splitter.split_documents(raw_documents)

    # Splitting can yield no chunks; guard before indexing docs[0] below.
    if not docs:
        return ""

    num_docs = len(docs)
    num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)
    print(
        f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens"
    )

    summary_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
    return summary_chain.run(docs)
31
+
32
+
33
def main():
    """Summarize one hard-coded PDF, print the summary and save it to disk.

    The source PDF is parsed with ``parse_document``, summarized with
    ``summarize``, printed, and written to a ``-summary.txt`` file under
    ``data/processed``. If parsing yields nothing, no file is written.
    """
    raw_documents = parse_document(
        "data/raw/cixiidae/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier.pdf"
    )
    # parse_document reports its own errors; without content, calling the
    # summarizer would only fail further down.
    if not raw_documents:
        print("No document content to summarize; nothing written.")
        return

    output = summarize(raw_documents)
    print(output)
    # Explicit UTF-8: the summary may contain non-ASCII text, and the platform
    # default encoding is locale-dependent.
    with open(
        "data/processed/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier-summary.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.write(output)


if __name__ == "__main__":
    main()