markytools committed on
Commit
bb0cbef
·
1 Parent(s): 8f797dc

working code using the OpenRouter API

Browse files
Files changed (7) hide show
  1. .gitattributes +35 -35
  2. .gitignore +3 -1
  3. EcommerceDataset.csv +0 -0
  4. README.md +13 -13
  5. app.py +200 -190
  6. packages.txt +1 -1
  7. requirements.txt +2 -7
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1 +1,3 @@
1
- .streamlit/
 
 
 
1
+ .streamlit/
2
+ app.ipynb
3
+ .ipynb_checkpoints/
EcommerceDataset.csv CHANGED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: Pdfchat
3
- emoji: 💻
4
- colorFrom: purple
5
- colorTo: pink
6
- sdk: streamlit
7
- sdk_version: 1.26.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Pdfchat
3
+ emoji: 💻
4
+ colorFrom: purple
5
+ colorTo: pink
6
+ sdk: streamlit
7
+ sdk_version: 1.26.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,190 +1,200 @@
1
- import streamlit as st
2
- from tempfile import NamedTemporaryFile
3
-
4
- import pprint
5
- import google.generativeai as palm
6
- import os
7
- from dotenv import load_dotenv, find_dotenv
8
- from langchain.embeddings import GooglePalmEmbeddings
9
- from langchain.llms import GooglePalm
10
-
11
- from langchain.document_loaders import UnstructuredURLLoader #load urls into docoument-loader
12
- from langchain.chains.question_answering import load_qa_chain
13
- from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
14
- from langchain.text_splitter import CharacterTextSplitter #text splitter
15
- from langchain.chains import RetrievalQA
16
- from langchain.document_loaders import UnstructuredPDFLoader #load pdf
17
- from langchain.agents import create_pandas_dataframe_agent
18
- # from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
19
-
20
- import pandas as pd
21
- import numpy as np
22
- import pprint
23
-
24
- isPswdValid = False
25
- try:
26
- pswdVal = st.experimental_get_query_params()['pwd'][0]
27
- if pswdVal==st.secrets["PSWD"]:
28
- isPswdValid = True
29
- except:
30
- pass
31
-
32
- if not isPswdValid:
33
- st.write("Invalid Password")
34
- else:
35
- radioButtonList = ["E-commerce CSV (https://www.kaggle.com/datasets/mervemenekse/ecommerce-dataset)",
36
- "Upload my own CSV",
37
- "Upload my own PDF",
38
- "URL Chat with Google's Latest Earnings (https://abc.xyz/investor/)",
39
- "Enter my own URL"]
40
-
41
- # Add some designs to the radio buttons
42
- st.markdown("""
43
- <style>
44
- .stRadio {
45
- padding: 10px;
46
- border-radius: 5px;
47
- background-color: #f5f5f5;
48
- }
49
-
50
- .stRadio input[type="radio"] {
51
- position: absolute;
52
- opacity: 0;
53
- cursor: pointer;
54
- }
55
-
56
- .stRadio label {
57
- display: flex;
58
- justify-content: center;
59
- align-items: center;
60
- cursor: pointer;
61
- font-size: 16px;
62
- color: #333;
63
- }
64
-
65
- .stRadio label:hover {
66
- color: #000;
67
- }
68
-
69
- .stRadio.st-selected input[type="radio"] ~ label {
70
- color: #000;
71
- background-color: #d9d9d9;
72
- }
73
- </style>
74
- """, unsafe_allow_html=True)
75
-
76
- genre = st.radio(
77
- "Tired of reading your files? Chat with it using AI! Choose dataset to finetune", radioButtonList, index=0
78
- )
79
-
80
- # Initialize language model
81
- load_dotenv(find_dotenv()) # read local .env file
82
- api_key = st.secrets["PALM_API_KEY"] # put your API key here
83
- os.environ["GOOGLE_API_KEY"] = st.secrets["PALM_API_KEY"]
84
- palm.configure(api_key=api_key)
85
- llm = GooglePalm()
86
- llm.temperature = 0.1
87
-
88
- pdfCSVURLText = ""
89
- if genre==radioButtonList[0]:
90
- pdfCSVURLText = "CSV"
91
- exampleQuestion = "Question1: What was the most sold item? Question2: What was the most common payment?"
92
- dataDF = pd.read_csv('EcommerceDataset.csv', encoding= 'unicode_escape')
93
- # st.write('You selected comedy.')
94
- # else:
95
- # st.write(f'''Password streamlit app: {st.secrets["PSWD"]}''')
96
- elif genre==radioButtonList[1]:
97
- pdfCSVURLText = "CSV"
98
- exampleQuestion = "What are the data columns?"
99
- elif genre==radioButtonList[2]:
100
- pdfCSVURLText = "PDF"
101
- exampleQuestion = "Can you summarize the contents?"
102
- elif genre==radioButtonList[3]:
103
- pdfCSVURLText = "URL"
104
- exampleQuestion = "What is Google's latest earnings?"
105
- urls = ['https://abc.xyz/investor/']
106
- loader = [UnstructuredURLLoader(urls=urls)]
107
- index = VectorstoreIndexCreator(
108
- embedding=GooglePalmEmbeddings(),
109
- text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loader)
110
-
111
- chain = RetrievalQA.from_chain_type(llm=llm,
112
- chain_type="stuff",
113
- retriever=index.vectorstore.as_retriever(),
114
- input_key="question")
115
- elif genre==radioButtonList[4]:
116
- pdfCSVURLText = "URL"
117
- exampleQuestion = "Can you summarize the contents?"
118
-
119
- isCustomURL = genre==radioButtonList[4]
120
- urlInput = st.text_input('Enter your own URL', '', placeholder="Type your URL here (e.g. https://abc.xyz/investor/)", disabled=not isCustomURL)
121
-
122
- isCustomPDF = genre==radioButtonList[1] or genre==radioButtonList[2]
123
- uploaded_file = st.file_uploader(f"Upload your own {pdfCSVURLText} here", type=pdfCSVURLText.lower(), disabled=not isCustomPDF)
124
- uploadedFilename = ""
125
- if uploaded_file is not None:
126
- with NamedTemporaryFile(dir='.', suffix=f'.{pdfCSVURLText.lower()}') as f:
127
- f.write(uploaded_file.getbuffer())
128
- uploadedFilename = f.name
129
- if genre==radioButtonList[1]: # Custom CSV Upload
130
- dataDF = pd.read_csv(uploadedFilename, encoding= 'unicode_escape')
131
- elif genre==radioButtonList[2]: # Custom PDF Upload
132
- pdf_loaders = [UnstructuredPDFLoader(uploadedFilename)]
133
- pdf_index = VectorstoreIndexCreator(
134
- embedding=GooglePalmEmbeddings(),
135
- text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(pdf_loaders)
136
- pdf_chain = RetrievalQA.from_chain_type(llm=llm,
137
- chain_type="stuff",
138
- retriever=pdf_index.vectorstore.as_retriever(),
139
- input_key="question")
140
-
141
- enableChatBox = False
142
- if genre==radioButtonList[0]: # E-commerce CSV
143
- enableChatBox = True
144
- elif genre==radioButtonList[1]: # Custom CSV Upload
145
- enableChatBox = uploadedFilename[-4:]==".csv"
146
- elif genre==radioButtonList[2]: # Custom PDF Upload
147
- enableChatBox = uploadedFilename[-4:]==".pdf"
148
- elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
149
- enableChatBox = True
150
- elif genre==radioButtonList[4]: # Custom URL
151
- enableChatBox = True
152
-
153
- chatTextStr = st.text_input(f'Ask me anything about this {pdfCSVURLText}', '', placeholder=f"Type here (e.g. {exampleQuestion})", disabled=not enableChatBox)
154
- chatWithPDFButton = "CLICK HERE TO START CHATTING"
155
- if st.button(chatWithPDFButton, disabled=not enableChatBox and not chatTextStr): # Button Cliked
156
-
157
-
158
- if genre==radioButtonList[0]: # E-commerce CSV
159
- # Initializing the agent
160
- agent = create_pandas_dataframe_agent(llm, dataDF, verbose=False)
161
- answer = agent.run(chatTextStr)
162
- st.write(answer)
163
-
164
- elif genre==radioButtonList[1]: # Custom CSV Upload
165
- # Initializing the agent
166
- agent = create_pandas_dataframe_agent(llm, dataDF, verbose=False)
167
- answer = agent.run(chatTextStr)
168
- st.write(answer)
169
-
170
- elif genre==radioButtonList[2]: # Custom PDF Upload
171
- pdf_answer = pdf_chain.run(chatTextStr)
172
- st.write(pdf_answer)
173
-
174
- elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
175
- answer = chain.run(chatTextStr)
176
- st.write(answer)
177
-
178
- elif genre==radioButtonList[4]: # Custom URL
179
- urls = [urlInput]
180
- loader = [UnstructuredURLLoader(urls=urls)]
181
- index = VectorstoreIndexCreator(
182
- embedding=GooglePalmEmbeddings(),
183
- text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loader)
184
-
185
- chain = RetrievalQA.from_chain_type(llm=llm,
186
- chain_type="stuff",
187
- retriever=index.vectorstore.as_retriever(),
188
- input_key="question")
189
- answer = chain.run(chatTextStr)
190
- st.write(answer)
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from tempfile import NamedTemporaryFile
3
+
4
+ import pprint
5
+ import os
6
+ from dotenv import load_dotenv, find_dotenv
7
+ import os
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain_community.document_loaders import PyPDFLoader
10
+ from langchain.document_loaders.csv_loader import CSVLoader
11
+ from langchain.document_loaders import WebBaseLoader
12
+
13
+ import pandas as pd
14
+ import numpy as np
15
+ import pprint
16
+
17
+ defaultGoogleURL = "https://www.google.com/search?q=google+earnings"
18
+ OPEN_ROUTER_KEY = st.secrets["OPEN_ROUTER_KEY"]
19
+ OPEN_ROUTER_MODEL = "meta-llama/llama-3.1-70b-instruct:free"
20
+
21
+ def pretty_print_columns(text):
22
+ """
23
+ Collapses the provided CSV column description text onto a single line, dropping blank lines.
24
+
25
+ Args:
26
+ text (str): The input string containing the column descriptions.
27
+
28
+ Returns:
29
+ str: The beautified string with neatly formatted column descriptions.
30
+ """
31
+ return " ".join([line.strip() for line in text.splitlines() if line.strip()])
32
+
33
+ isPswdValid = True
34
+ try:
35
+ pswdVal = st.experimental_get_query_params()['pwd'][0]
36
+ if pswdVal==st.secrets["PSWD"]:
37
+ isPswdValid = True
38
+ except:
39
+ pass
40
+
41
+ if not isPswdValid:
42
+ st.write("Invalid Password")
43
+ else:
44
+ radioButtonList = ["E-commerce CSV (https://www.kaggle.com/datasets/mervemenekse/ecommerce-dataset)",
45
+ "Upload my own CSV",
46
+ "Upload my own PDF",
47
+ f"URL Chat with Google's Latest Earnings ({defaultGoogleURL})",
48
+ "Enter my own URL"]
49
+
50
+ # Add some designs to the radio buttons
51
+ st.markdown("""
52
+ <style>
53
+ .stRadio {
54
+ padding: 10px;
55
+ border-radius: 5px;
56
+ background-color: #f5f5f5;
57
+ }
58
+
59
+ .stRadio input[type="radio"] {
60
+ position: absolute;
61
+ opacity: 0;
62
+ cursor: pointer;
63
+ }
64
+
65
+ .stRadio label {
66
+ display: flex;
67
+ justify-content: center;
68
+ align-items: center;
69
+ cursor: pointer;
70
+ font-size: 16px;
71
+ color: #333;
72
+ }
73
+
74
+ .stRadio label:hover {
75
+ color: #000;
76
+ }
77
+
78
+ .stRadio.st-selected input[type="radio"] ~ label {
79
+ color: #000;
80
+ background-color: #d9d9d9;
81
+ }
82
+ </style>
83
+ """, unsafe_allow_html=True)
84
+
85
+ genre = st.radio(
86
+ "Tired of reading your files? Chat with it using AI! Choose dataset to finetune", radioButtonList, index=0
87
+ )
88
+
89
+ # Initialize language model
90
+ load_dotenv(find_dotenv()) # read local .env file
91
+ llm = ChatOpenAI(model=OPEN_ROUTER_MODEL, temperature=0.1, openai_api_key=OPEN_ROUTER_KEY, openai_api_base="https://openrouter.ai/api/v1")
92
+
93
+ pdfCSVURLText = ""
94
+ if genre==radioButtonList[0]:
95
+ pdfCSVURLText = "CSV"
96
+ exampleQuestion = "Question1: What was the most sold item? Question2: What was the most common payment?"
97
+ loader = CSVLoader(file_path='EcommerceDataset.csv')
98
+ csv_data = loader.load()
99
+ # st.write('You selected comedy.')
100
+ # else:
101
+ # st.write(f'''Password streamlit app: {st.secrets["PSWD"]}''')
102
+ elif genre==radioButtonList[1]:
103
+ pdfCSVURLText = "CSV"
104
+ exampleQuestion = "What are the data columns?"
105
+ elif genre==radioButtonList[2]:
106
+ pdfCSVURLText = "PDF"
107
+ exampleQuestion = "Can you summarize the contents?"
108
+ elif genre==radioButtonList[3]:
109
+ pdfCSVURLText = "URL"
110
+ exampleQuestion = "What is Google's latest earnings?"
111
+ elif genre==radioButtonList[4]:
112
+ pdfCSVURLText = "URL"
113
+ exampleQuestion = "Can you summarize the contents?"
114
+
115
+ isCustomURL = genre==radioButtonList[4]
116
+ urlInput = st.text_input('Enter your own URL', '', placeholder=f"Type your URL here (e.g. {defaultGoogleURL})", disabled=not isCustomURL)
117
+
118
+ isCustomPDF = genre==radioButtonList[1] or genre==radioButtonList[2]
119
+ uploaded_file = st.file_uploader(f"Upload your own {pdfCSVURLText} here", type=pdfCSVURLText.lower(), disabled=not isCustomPDF)
120
+ uploadedFilename = ""
121
+ if uploaded_file is not None:
122
+ with NamedTemporaryFile(dir='.', suffix=f'.{pdfCSVURLText.lower()}') as f:
123
+ f.write(uploaded_file.getbuffer())
124
+ uploadedFilename = f.name
125
+ if genre==radioButtonList[1]: # Custom CSV Upload
126
+ loader = CSVLoader(file_path=uploadedFilename)
127
+ csv_data = loader.load()
128
+ elif genre==radioButtonList[2]: # Custom PDF Upload
129
+ loader = PyPDFLoader(uploadedFilename)
130
+ pdf_pages = loader.load_and_split()
131
+
132
+ enableChatBox = False
133
+ if genre==radioButtonList[0]: # E-commerce CSV
134
+ enableChatBox = True
135
+ elif genre==radioButtonList[1]: # Custom CSV Upload
136
+ enableChatBox = uploadedFilename[-4:]==".csv"
137
+ elif genre==radioButtonList[2]: # Custom PDF Upload
138
+ enableChatBox = uploadedFilename[-4:]==".pdf"
139
+ elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
140
+ enableChatBox = True
141
+ elif genre==radioButtonList[4]: # Custom URL
142
+ enableChatBox = True
143
+
144
+ chatTextStr = st.text_input(f'Ask me anything about this {pdfCSVURLText}', '', placeholder=f"Type here (e.g. {exampleQuestion})", disabled=not enableChatBox)
145
+ chatWithPDFButton = "CLICK HERE TO START CHATTING"
146
+ if st.button(chatWithPDFButton, disabled=not enableChatBox and not chatTextStr): # Button Clicked
147
+ if genre==radioButtonList[0]: # E-commerce CSV
148
+ # Send the CSV contents and the user's question to the LLM in a single prompt
149
+ answer = llm.predict(f'''
150
+ I have CSV file contents below:
151
+
152
+ {str(csv_data)}
153
+
154
+ {chatTextStr}
155
+ ''')
156
+ st.write(answer)
157
+
158
+ elif genre==radioButtonList[1]: # Custom CSV Upload
159
+ # Send the CSV contents and the user's question to the LLM in a single prompt
160
+ answer = llm.predict(f'''
161
+ I have CSV file contents below:
162
+
163
+ {str(csv_data)}
164
+
165
+ {chatTextStr}
166
+ ''')
167
+ st.write(answer)
168
+
169
+ elif genre==radioButtonList[2]: # Custom PDF Upload
170
+ pdf_answer = llm.predict(f'''
171
+ I have PDF file contents below:
172
+
173
+ {str(pdf_pages)}
174
+
175
+ {chatTextStr}
176
+ ''')
177
+ st.write(pdf_answer)
178
+ elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
179
+ loader = WebBaseLoader(defaultGoogleURL)
180
+ web_data = loader.load()
181
+ answer = llm.predict(f'''
182
+ I have website contents below:
183
+
184
+ {str(web_data)}
185
+
186
+ {chatTextStr}
187
+ ''')
188
+
189
+ st.write(answer)
190
+ elif genre==radioButtonList[4]: # Custom URL
191
+ loader = WebBaseLoader(urlInput)
192
+ web_data = loader.load()
193
+ answer = llm.predict(f'''
194
+ I have website contents below:
195
+
196
+ {str(web_data)}
197
+
198
+ {chatTextStr}
199
+ ''')
200
+ st.write(answer)
packages.txt CHANGED
@@ -1 +1 @@
1
- libmagic-dev
 
1
+ libmagic-dev
requirements.txt CHANGED
@@ -1,7 +1,2 @@
1
- opencv-python
2
- google-generativeai
3
- langchain==0.0.310
4
- unstructured==0.10.20
5
- chromadb==0.4.14
6
- pdfminer.six
7
- pdf2image
 
1
+ pdf2image
2
+ langchain_openai==0.2.9