segestic committed on
Commit 17057cd · verified · 1 Parent(s): 4f8f322

Update util.py

Files changed (1)
  1. util.py +103 -101
util.py CHANGED
@@ -1,101 +1,103 @@
- from pypdf import PdfReader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
- from langchain_community.embeddings.ollama import OllamaEmbeddings
- from langchain_community.embeddings.bedrock import BedrockEmbeddings
- from langchain_community.vectorstores import FAISS
- from langchain.chains.combine_documents import create_stuff_documents_chain
- from langchain.chains import create_retrieval_chain
- from dotenv import load_dotenv
- import streamlit as st
- import os
-
- load_dotenv()
-
-
- # Function to get the API key
- def get_api_key():
-     # Try to read the API key from the environment
-     try:
-         groq_api_key = os.getenv("GROQ_API_KEY", "")
-
-         return groq_api_key
-     except Exception as e:
-         print(e)
-
- def get_inference_api_key():
-     try:
-         inference_api_key = os.getenv("INFERENCE_API_KEY", "")
-
-         return inference_api_key
-     except Exception as e:
-         print(e)
-
-
- # Function for API configuration at sidebar
- def sidebar_api_key_configuration():
-     groq_api_key = get_api_key()
-     if groq_api_key == '':
-         st.sidebar.warning('Enter the API Key(s) 🗝️')
-         st.session_state.prompt_activation = False
-     elif (groq_api_key.startswith('gsk_') and (len(groq_api_key) == 56)):
-         st.sidebar.success("Let's Proceed!", icon='👉')
-         st.session_state.prompt_activation = True
-     else:
-         st.sidebar.warning('Please enter the correct API Key 🗝️!', icon='⚠️')
-         st.session_state.prompt_activation = False
-     return groq_api_key
-
-
- def sidebar_groq_model_selection():
-     st.sidebar.subheader("Model Selection")
-     model = st.sidebar.selectbox('Select the Model', ('Llama3-8b-8192', 'Llama3-70b-8192', 'Mixtral-8x7b-32768',
-                                                       'Gemma-7b-it'), label_visibility="collapsed")
-     return model
-
-
- # Read PDF data
- def read_pdf_data(pdf_docs):
-     text = ""
-     for pdf in pdf_docs:
-         pdf_reader = PdfReader(pdf)
-         for page in pdf_reader.pages:
-             text += page.extract_text()
-     return text
-
-
- # Split data into chunks
- def split_data(text):
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-     text_chunks = text_splitter.split_text(text)
-     return text_chunks
-
-
- def get_embedding_function():
-     # embeddings = BedrockEmbeddings(
-     #     credentials_profile_name="default", region_name="us-east-1"
-     # )
-     # embeddings = OllamaEmbeddings(model="nomic-embed-text")
-     inference_api_key = get_inference_api_key()
-
-     embeddings = HuggingFaceInferenceAPIEmbeddings(
-         api_key=inference_api_key, model_name="sentence-transformers/all-MiniLM-l6-v2"
-     )
-     return embeddings
-
-
- # Create vectorstore
- def create_vectorstore(pdf_docs):
-     raw_text = read_pdf_data(pdf_docs)  # Get PDF text
-     text_chunks = split_data(raw_text)  # Get the text chunks
-     embeddings = get_embedding_function()  # Get the embedding function
-     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-     return vectorstore
-
-
- # Get the LLM's response to the user's question
- def get_llm_response(llm, prompt, question):
-     document_chain = create_stuff_documents_chain(llm, prompt)
-     retrieval_chain = create_retrieval_chain(st.session_state.vector_store.as_retriever(), document_chain)
-     response = retrieval_chain.invoke({'input': question})
-     return response
 
 
 
+ from pypdf import PdfReader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+ from langchain_community.embeddings.ollama import OllamaEmbeddings
+ from langchain_community.embeddings.bedrock import BedrockEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain.chains import create_retrieval_chain
+ from dotenv import load_dotenv
+ import streamlit as st
+ import os
+
+ load_dotenv()
+
+
+ # Function to get the API key
+ def get_api_key():
+     # Try to read the API key from the environment
+     try:
+         groq_api_key = os.getenv("GROQ_API_KEY", "")
+
+         return groq_api_key
+     except Exception as e:
+         print(e)
+
+ def get_inference_api_key():
+     try:
+         inference_api_key = os.getenv("INFERENCE_API_KEY", "")
+
+         return inference_api_key
+     except Exception as e:
+         print(e)
+
+
+ # Function for API configuration at sidebar
+ def sidebar_api_key_configuration():
+     groq_api_key = get_api_key()
+     if groq_api_key == '':
+         st.sidebar.warning('Enter the API Key(s) 🗝️')
+         st.session_state.prompt_activation = False
+     elif (groq_api_key.startswith('gsk_') and (len(groq_api_key) == 56)):
+         st.sidebar.success("Let's Proceed!", icon='👉')
+         st.session_state.prompt_activation = True
+     else:
+         st.sidebar.warning('Please enter the correct API Key 🗝️!', icon='⚠️')
+         st.session_state.prompt_activation = False
+     return groq_api_key
+
+
+ def sidebar_groq_model_selection():
+     st.sidebar.subheader("Model Selection")
+     model = st.sidebar.selectbox('Select the Model', ('Llama3-8b-8192', 'Llama3-70b-8192', 'Mixtral-8x7b-32768',
+                                                       'Gemma-7b-it'), label_visibility="collapsed")
+     return model
+
+
+ # Read PDF data
+ def read_pdf_data(pdf_docs):
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
+
+ # Split data into chunks
+ def split_data(text):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     text_chunks = text_splitter.split_text(text)
+     return text_chunks
+
+
+ def get_embedding_function():
+     # embeddings = BedrockEmbeddings(
+     #     credentials_profile_name="default", region_name="us-east-1"
+     # )
+     # embeddings = OllamaEmbeddings(model="nomic-embed-text")
+     inference_api_key = get_inference_api_key()
+
+     embeddings = HuggingFaceInferenceAPIEmbeddings(
+         api_key=inference_api_key, model_name="sentence-transformers/all-MiniLM-l6-v2"
+     )
+     return embeddings
+
+
+ # Create vectorstore
+ def create_vectorstore(pdf_docs):
+     raw_text = read_pdf_data(pdf_docs)  # Get PDF text
+     text_chunks = split_data(raw_text)  # Get the text chunks
+     embeddings = get_embedding_function()  # Get the embedding function
+
+     # Pass the callable embedding function (embed_query) to FAISS
+     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings.embed_query)
+     return vectorstore
+
+
+ # Get the LLM's response to the user's question
+ def get_llm_response(llm, prompt, question):
+     document_chain = create_stuff_documents_chain(llm, prompt)
+     retrieval_chain = create_retrieval_chain(st.session_state.vector_store.as_retriever(), document_chain)
+     response = retrieval_chain.invoke({'input': question})
+     return response
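
For context, here is a minimal sketch of a Streamlit entry point that wires these helpers together. The file name app.py, the prompt template, and the ChatGroq model wiring are illustrative assumptions, not part of this commit.

# app.py: hypothetical driver for the helpers in util.py
import streamlit as st
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from util import (sidebar_api_key_configuration, sidebar_groq_model_selection,
                  create_vectorstore, get_llm_response)

groq_api_key = sidebar_api_key_configuration()
model = sidebar_groq_model_selection()

# Build the FAISS index once and cache it in session state
pdf_docs = st.sidebar.file_uploader("Upload PDFs", type="pdf", accept_multiple_files=True)
if st.sidebar.button("Process") and pdf_docs:
    st.session_state.vector_store = create_vectorstore(pdf_docs)

question = st.text_input("Ask a question about the uploaded documents")
if question and st.session_state.get("prompt_activation") and "vector_store" in st.session_state:
    llm = ChatGroq(groq_api_key=groq_api_key, model_name=model)
    # create_stuff_documents_chain expects a prompt with a {context} variable
    prompt = ChatPromptTemplate.from_template(
        "Answer the question using only the context below.\n\n"
        "Context:\n{context}\n\nQuestion: {input}"
    )
    response = get_llm_response(llm, prompt, question)
    st.write(response["answer"])

Note that get_llm_response reads the retriever from st.session_state.vector_store, so the index must be stored under that exact session-state key before a question is asked.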