naveenvenkatesh committed on
Commit
970b086
1 Parent(s): c837a58

Update summary_extractor.py

Browse files
Files changed (1) hide show
  1. summary_extractor.py +16 -12
summary_extractor.py CHANGED
@@ -1,15 +1,14 @@
1
- # import openai
2
  import json
3
  from typing import Dict
4
  import os
5
  from typing import List
6
- from langchain.chat_models import ChatOpenAI
7
- from langchain.document_loaders import PyPDFLoader
8
  from langchain.chains.mapreduce import MapReduceChain
9
  from langchain.text_splitter import CharacterTextSplitter
10
  from langchain.chains.summarize import load_summarize_chain
11
  from langchain.prompts import PromptTemplate
12
-
13
 
14
  class Extractor:
15
 
@@ -20,13 +19,16 @@ class Extractor:
20
  config (dict): Configuration settings loaded from a JSON file.
21
  pdf_file_path (str): Path to the input PDF file.
22
  """
 
23
  def __init__(self):
24
  """
25
  Initialize the Extractor class.
26
  """
27
 
28
- # Set OpenAI API key
29
- # os.environ["OPENAI_API_KEY"] = ""
 
 
30
 
31
  def _document_loader(self,pdf_file_path) -> List[str]:
32
  """
@@ -36,7 +38,7 @@ class Extractor:
36
  List[str]: List of text content from each page.
37
  """
38
  try:
39
- loader = PyPDFLoader(pdf_file_path.name)
40
  pages = loader.load_and_split()
41
  return pages
42
 
@@ -54,15 +56,15 @@ class Extractor:
54
  try:
55
  # Load the document texts
56
  docs = self._document_loader(pdf_file_path)
57
-
58
  # Initialize the text splitter with specified chunk size and overlap
59
  text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
60
  chunk_size=1000, chunk_overlap=200
61
  )
62
-
63
  # Split the documents into chunks
64
  split_docs = text_splitter.split_documents(docs)
65
-
66
  # Return the list of split document chunks
67
  return split_docs
68
 
@@ -70,13 +72,15 @@ class Extractor:
70
  print(f"Error while splitting document text: {str(e)}")
71
 
72
 
73
- def _refine_summary(self,pdf_file_path) -> str:
 
74
  """
75
  Generate a refined summary of the document using language models.
76
 
77
  Returns:
78
  str: Refined summary text.
79
  """
 
80
  try:
81
  # Split documents into chunks for efficient processing
82
  split_docs = self._document_text_spilliter(pdf_file_path)
@@ -103,7 +107,7 @@ class Extractor:
103
 
104
  # Load the summarization chain using the ChatOpenAI language model
105
  chain = load_summarize_chain(
106
- llm = ChatOpenAI(temperature=0),
107
  chain_type="refine",
108
  question_prompt=prompt,
109
  refine_prompt=refine_prompt,
 
1
+ import openai
2
  import json
3
  from typing import Dict
4
  import os
5
  from typing import List
6
+ from langchain_openai import AzureChatOpenAI
 
7
  from langchain.chains.mapreduce import MapReduceChain
8
  from langchain.text_splitter import CharacterTextSplitter
9
  from langchain.chains.summarize import load_summarize_chain
10
  from langchain.prompts import PromptTemplate
11
+ from langchain_community.document_loaders import UnstructuredFileLoader
12
 
13
  class Extractor:
14
 
 
19
  config (dict): Configuration settings loaded from a JSON file.
20
  pdf_file_path (str): Path to the input PDF file.
21
  """
22
+
23
  def __init__(self):
24
  """
25
  Initialize the Extractor class.
26
  """
27
 
28
+ openai.api_type = os.getenv['api_type']
29
+ os.environ["AZURE_OPENAI_API_KEY"] = os.getenv['api_key']
30
+ os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv['api_base']
31
+ os.environ["OPENAI_API_VERSION"] = os.getenv['api_version']
32
 
33
  def _document_loader(self,pdf_file_path) -> List[str]:
34
  """
 
38
  List[str]: List of text content from each page.
39
  """
40
  try:
41
+ loader = UnstructuredFileLoader(pdf_file_path)
42
  pages = loader.load_and_split()
43
  return pages
44
 
 
56
  try:
57
  # Load the document texts
58
  docs = self._document_loader(pdf_file_path)
59
+
60
  # Initialize the text splitter with specified chunk size and overlap
61
  text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
62
  chunk_size=1000, chunk_overlap=200
63
  )
64
+
65
  # Split the documents into chunks
66
  split_docs = text_splitter.split_documents(docs)
67
+
68
  # Return the list of split document chunks
69
  return split_docs
70
 
 
72
  print(f"Error while splitting document text: {str(e)}")
73
 
74
 
75
+ def refine_summary(self,pdf_file_path) -> str:
76
+
77
  """
78
  Generate a refined summary of the document using language models.
79
 
80
  Returns:
81
  str: Refined summary text.
82
  """
83
+
84
  try:
85
  # Split documents into chunks for efficient processing
86
  split_docs = self._document_text_spilliter(pdf_file_path)
 
107
 
108
  # Load the summarization chain using the AzureChatOpenAI language model
109
  chain = load_summarize_chain(
110
+ llm = AzureChatOpenAI(azure_deployment = "ChatGPT"),
111
  chain_type="refine",
112
  question_prompt=prompt,
113
  refine_prompt=refine_prompt,