Pijush2023 committed on
Commit b43948a · verified · 1 Parent(s): 940a41d

Update app.py

Files changed (1)
  1. app.py +67 -1688
app.py CHANGED
@@ -1,1708 +1,87 @@
- from gradio_client import Client
  import gradio as gr
- import requests
- import os
- import time
- import re
- import logging
- import tempfile
- import folium
- import concurrent.futures
- import torch
- from PIL import Image
- from datetime import datetime
- from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
- from googlemaps import Client as GoogleMapsClient
- from gtts import gTTS
- from diffusers import StableDiffusionPipeline
- from langchain_openai import OpenAIEmbeddings, ChatOpenAI
- from langchain_pinecone import PineconeVectorStore
- from langchain.prompts import PromptTemplate
- from langchain.chains import RetrievalQA
- from langchain.chains.conversation.memory import ConversationBufferWindowMemory
- from huggingface_hub import login
- from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
- from parler_tts import ParlerTTSForConditionalGeneration
- from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
- from scipy.io.wavfile import write as write_wav
- from pydub import AudioSegment
- from string import punctuation
- import librosa
- from pathlib import Path
- import torchaudio
- import numpy as np
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- from langchain_huggingface import HuggingFaceEmbeddings
- from langchain_community.document_loaders import PDFPlumberLoader
  import pdfplumber
38
-
39
-
40
- # Neo4j imports
41
- from langchain.chains import GraphCypherQAChain
42
- from langchain_community.graphs import Neo4jGraph
43
- from langchain_community.document_loaders import HuggingFaceDatasetLoader
44
- from langchain_text_splitters import CharacterTextSplitter
45
- from langchain_experimental.graph_transformers import LLMGraphTransformer
46
- from langchain_core.prompts import ChatPromptTemplate
47
- from langchain_core.pydantic_v1 import BaseModel, Field
48
- from langchain_core.messages import AIMessage, HumanMessage
49
- from langchain_core.output_parsers import StrOutputParser
50
- from langchain_core.runnables import RunnableBranch, RunnableLambda, RunnableParallel, RunnablePassthrough
51
- from serpapi.google_search import GoogleSearch
52
-
53
- #Parler TTS v1 Modules
54
-
55
  import os
56
- import re
57
- import tempfile
58
- import soundfile as sf
59
- from string import punctuation
60
- from pydub import AudioSegment
61
- from transformers import AutoTokenizer, AutoFeatureExtractor
62
-
63
-
64
-
65
- # Helper for the API calls below: today's date in ISO format (used as the default check-in / outbound date)
- def get_current_date1():
-     return datetime.now().strftime("%Y-%m-%d")
68
-
69
- # Usage
70
- current_date1 = get_current_date1()
71
-
72
-
73
-
74
- # Set environment variables for CUDA
75
- os.environ['PYTORCH_USE_CUDA_DSA'] = '1'
76
- os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
77
- os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
78
-
79
-
80
- hf_token = os.getenv("HF_TOKEN")
- if hf_token is None:
-     print("Please set your Hugging Face token in the environment variables.")
- else:
-     login(token=hf_token)
-
- logging.basicConfig(level=logging.DEBUG)
87
-
88
-
89
- from langchain.embeddings.huggingface import HuggingFaceEmbeddings
90
- gpt_embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
91
-
92
- # Initialize HuggingFaceEmbeddings properly
93
- # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
94
- embeddings=gpt_embeddings
95
-
96
-
97
-
98
-
99
- #Initialization
100
-
101
- # Initialize the models
102
- def initialize_phi_model():
103
- model = AutoModelForCausalLM.from_pretrained(
104
- "microsoft/Phi-3.5-mini-instruct",
105
- device_map="cuda",
106
- torch_dtype="auto",
107
- trust_remote_code=True,
108
  )
109
- tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
110
- return pipeline("text-generation", model=model, tokenizer=tokenizer)
111
-
112
- def initialize_gpt_model():
113
- return ChatOpenAI(api_key=os.environ['OPENAI_API_KEY'], temperature=0, model='gpt-4o')
114
-
115
- def initialize_gpt4o_mini_model():
116
- return ChatOpenAI(api_key=os.environ['OPENAI_API_KEY'], temperature=0, model='gpt-4o-mini')
117
-
118
-
119
-
120
-
121
-
122
-
123
- # Initialize all models
124
- phi_pipe = initialize_phi_model()
125
- gpt_model = initialize_gpt_model()
126
- gpt4o_mini_model = initialize_gpt4o_mini_model()
127
-
128
-
129
-
130
-
131
- # Existing embeddings and vector store for GPT-4o
132
- gpt_embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
133
- gpt_vectorstore = PineconeVectorStore(index_name="italy-pdf", embedding=gpt_embeddings)
134
- gpt_retriever = gpt_vectorstore.as_retriever(search_kwargs={'k': 5})
135
-
136
- # New vector store setup for Phi-3.5
137
- phi_embeddings = embeddings
138
- phi_vectorstore = PineconeVectorStore(index_name="italy-pdf", embedding=embeddings)
139
- phi_retriever = phi_vectorstore.as_retriever(search_kwargs={'k': 5})
140
-
141
-
142
-
143
-
144
-
145
- # Pinecone setup
146
- from pinecone import Pinecone
147
- pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
148
-
149
- # index_name = "italyopenai"
150
- index_name = "italy-pdf"
151
- vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
152
- retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
153
-
154
- chat_model = ChatOpenAI(api_key=os.environ['OPENAI_API_KEY'], temperature=0, model='gpt-4o')
155
- chat_model1 = ChatOpenAI(api_key=os.environ['OPENAI_API_KEY'], temperature=0, model='gpt-4o-mini')
156
- conversational_memory = ConversationBufferWindowMemory(
-     memory_key='chat_history',
-     k=10,
-     return_messages=True
- )
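The window memory above is created but never attached to a chain anywhere in this file. A minimal sketch of how it is normally exercised (plain LangChain usage, assumed rather than taken from this commit):

# Sketch only: keeps the last k=10 exchanges under the 'chat_history' key.
conversational_memory.save_context(
    {"input": "Ciao, chi sei?"},
    {"output": "Sono un assistente per i documenti PDF."}
)
print(conversational_memory.load_memory_variables({})["chat_history"])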
161
-
162
- # Prompt templates
163
- def get_current_date():
164
- return datetime.now().strftime("%B %d, %Y")
165
-
166
- current_date = get_current_date()
167
-
168
- template1 = f"""Sei un esperto della lingua italiana e un madrelingua italiano con una profonda comprensione della comunicazione concisa. Eccelli nell'estrarre informazioni e nel presentarle in modo chiaro e diretto per facilitarne la comprensione e l'utilità.
169
- Il tuo compito è fornire risposte in base al documento fornito. Dovresti restituire le informazioni nel seguente formato:
170
- - Nome documento: devi dare il nome del pdf.
171
- - Numero di pagina:
172
- - Le prime 5 risposte: stampa le prime 5 risposte correlate in base al contesto
173
- - Risposta effettiva: In breve
174
- Tieni presente che la chiarezza e la brevità sono essenziali e dovresti fornire solo i dettagli richiesti senza ulteriori commenti. Se non riesci a trovare la risposta nel documento, rispondi semplicemente con "Questa domanda va oltre la mia conoscenza".
175
- {{context}}
176
- Question: {{question}}
177
- Helpful Answer:"""
178
-
179
- # template2 =f"""Hello there! As your friendly and knowledgeable guide here in Birmingham, Alabama . I'm here to help you discover the best experiences this beautiful city has to offer. It's a bright and sunny day today, {current_date}, and I’m excited to assist you with any insights or recommendations you need.
180
- # Whether you're looking for local events, sports ,clubs,concerts etc or just a great place to grab a bite, I've got you covered.Keep your response casual, short and sweet for the quickest response.Don't reveal the location and give the response in a descriptive way, I'm here to help make your time in Birmingham unforgettable!
181
- # "It’s always a pleasure to assist you!"
182
- # {{context}}
183
- # Question: {{question}}
184
- # Helpful Answer:"""
185
-
186
- template2=f"""Sei un esperto della lingua italiana e un madrelingua italiano con una profonda comprensione della comunicazione concisa. Eccelli nell'estrarre informazioni e nel presentarle in modo chiaro e diretto per facilitarne la comprensione e l'utilità.
187
- Il tuo compito è fornire risposte in base al documento fornito. Dovresti restituire le informazioni nel seguente formato:
188
- - Nome documento: devi dare il nome del pdf.
189
- - Numero di pagina:
190
- - Le prime 5 risposte: stampa le prime 5 risposte correlate in base al contesto
191
- - Risposta effettiva: In breve
192
- Tieni presente che la chiarezza e la brevità sono essenziali e dovresti fornire solo i dettagli richiesti senza ulteriori commenti. Se non riesci a trovare la risposta nel documento, rispondi semplicemente con "Questa domanda va oltre la mia conoscenza".
193
- {{context}}
194
- Question: {{question}}
195
- Helpful Answer:"""
196
-
197
- QA_CHAIN_PROMPT_1 = PromptTemplate(input_variables=["context", "question"], template=template1)
198
- QA_CHAIN_PROMPT_2 = PromptTemplate(input_variables=["context", "question"], template=template2)
199
-
200
- # Neo4j setup
201
- # graph = Neo4jGraph(url="neo4j+s://6457770f.databases.neo4j.io",
202
- # username="neo4j",
203
- # password="Z10duoPkKCtENuOukw3eIlvl0xJWKtrVSr-_hGX1LQ4"
204
- # )
205
- # Avoid pushing the graph documents to Neo4j every time
206
- # Only push the documents once and comment the code below after the initial push
207
- # dataset_name = "Pijush2023/birmindata07312024"
208
- # page_content_column = 'events_description'
209
- # loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
210
- # data = loader.load()
211
-
212
- # text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=50)
213
- # documents = text_splitter.split_documents(data)
214
-
215
- # llm_transformer = LLMGraphTransformer(llm=chat_model)
216
- # graph_documents = llm_transformer.convert_to_graph_documents(documents)
217
- # graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)
218
-
219
-
220
- #-------------------------------Comment Out------------------------------------------------------------------------------------------------------------------------
221
-
222
- # class Entities(BaseModel):
223
- # names: list[str] = Field(..., description="All the person, organization, or business entities that appear in the text")
224
-
225
- # entity_prompt = ChatPromptTemplate.from_messages([
226
- # ("system", "You are extracting organization and person entities from the text."),
227
- # ("human", "Use the given format to extract information from the following input: {question}"),
228
- # ])
229
-
230
- # entity_chain = entity_prompt | chat_model.with_structured_output(Entities)
231
-
232
- # def remove_lucene_chars(input: str) -> str:
233
- # return input.translate(str.maketrans({"\\": r"\\", "+": r"\+", "-": r"\-", "&": r"\&", "|": r"\|", "!": r"\!",
234
- # "(": r"\(", ")": r"\)", "{": r"\{", "}": r"\}", "[": r"\[", "]": r"\]",
235
- # "^": r"\^", "~": r"\~", "*": r"\*", "?": r"\?", ":": r"\:", '"': r'\"',
236
- # ";": r"\;", " ": r"\ "}))
237
-
238
- # def generate_full_text_query(input: str) -> str:
239
- # full_text_query = ""
240
- # words = [el for el in remove_lucene_chars(input).split() if el]
241
- # for word in words[:-1]:
242
- # full_text_query += f" {word}~2 AND"
243
- # full_text_query += f" {words[-1]}~2"
244
- # return full_text_query.strip()
245
-
246
- # def structured_retriever(question: str) -> str:
247
- # result = ""
248
- # entities = entity_chain.invoke({"question": question})
249
- # for entity in entities.names:
250
- # response = graph.query(
251
- # """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
252
- # YIELD node,score
253
- # CALL {
254
- # WITH node
255
- # MATCH (node)-[r:!MENTIONS]->(neighbor)
256
- # RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
257
- # UNION ALL
258
- # WITH node
259
- # MATCH (node)<-[r:!MENTIONS]-(neighbor)
260
- # RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
261
- # }
262
- # RETURN output LIMIT 50
263
- # """,
264
- # {"query": generate_full_text_query(entity)},
265
- # )
266
- # result += "\n".join([el['output'] for el in response])
267
- # return result
268
-
269
- # def retriever_neo4j(question: str):
270
- # structured_data = structured_retriever(question)
271
- # logging.debug(f"Structured data: {structured_data}")
272
- # return structured_data
273
-
274
- # _template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question,
275
- # in its original language.
276
- # Chat History:
277
- # {chat_history}
278
- # Follow Up Input: {question}
279
- # Standalone question:"""
280
-
281
- # CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
282
-
283
- # def _format_chat_history(chat_history: list[tuple[str, str]]) -> list:
284
- # buffer = []
285
- # for human, ai in chat_history:
286
- # buffer.append(HumanMessage(content=human))
287
- # buffer.append(AIMessage(content=ai))
288
- # return buffer
289
-
290
- # _search_query = RunnableBranch(
291
- # (
292
- # RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
293
- # run_name="HasChatHistoryCheck"
294
- # ),
295
- # RunnablePassthrough.assign(
296
- # chat_history=lambda x: _format_chat_history(x["chat_history"])
297
- # )
298
- # | CONDENSE_QUESTION_PROMPT
299
- # | ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
300
- # | StrOutputParser(),
301
- # ),
302
- # RunnableLambda(lambda x : x["question"]),
303
- # )
304
-
305
- # # template = """Answer the question based only on the following context:
306
- # # {context}
307
- # # Question: {question}
308
- # # Use natural language and be concise.
309
- # # Answer:"""
310
-
311
- # template = f"""As an expert concierge known for being helpful and a renowned guide for Birmingham, Alabama, I assist visitors in discovering the best that the city has to offer.I also assist the visitors about various sports and activities. Given today's sunny and bright weather on {current_date}, I am well-equipped to provide valuable insights and recommendations without revealing specific locations. I draw upon my extensive knowledge of the area, including perennial events and historical context.
312
- # In light of this, how can I assist you today? Feel free to ask any questions or seek recommendations for your day in Birmingham. If there's anything specific you'd like to know or experience, please share, and I'll be glad to help. Remember, keep the question concise for a quick,short ,crisp and accurate response.
313
- # "It was my pleasure!"
314
- # {{context}}
315
- # Question: {{question}}
316
- # Helpful Answer:"""
317
-
318
- # qa_prompt = ChatPromptTemplate.from_template(template)
319
-
320
- # chain_neo4j = (
321
- # RunnableParallel(
322
- # {
323
- # "context": _search_query | retriever_neo4j,
324
- # "question": RunnablePassthrough(),
325
- # }
326
- # )
327
- # | qa_prompt
328
- # | chat_model
329
- # | StrOutputParser()
330
- # )
331
-
332
-
333
-
334
-
335
-
336
-
337
-
338
- # phi_custom_template = """
339
- # <|system|>
340
- # Sei un esperto della lingua italiana e un madrelingua italiano. Il tuo compito è fornire risposte concise, dirette e brevi basate sul documento fornito. Non dovresti dare risposte personali o interpretative.
341
- # Fornisci dettagli sul documento che sto per condividere, come il nome del documento, il numero di pagina e altre informazioni specifiche in modo molto breve e diretto. Se non riesci a trovare la risposta, rispondi semplicemente con "Questa domanda è al di là delle mie conoscenze".
342
- # Ecco i dettagli del documento da considerare:
343
- # - Nome del documento:
344
- # - Pagina:
345
- # - Altre informazioni richieste:.<|end|>
346
- # <|user|>
347
- # {context}
348
- # Question: {question}<|end|>
349
- # <|assistant|>
350
- # Sure! Here's the information:
351
- # """
352
-
353
- # phi_custom_template = """
354
- # <|system|>
355
- # Sei un esperto della lingua italiana e un madrelingua italiano. Il tuo compito è fornire risposte concise, dirette e brevi basate sul documento fornito. Dovresti restituire le informazioni nel seguente formato:
356
- # - Nome del documento: (il nome del documento)
357
- # - Numero di pagina: (numero di pagina)
358
- # - Contenuto effettivo: (contenuto rilevante del documento)
359
- # Alla fine, fornisci una sezione separata per la risposta nel seguente formato:
360
- # - Risposta: (la risposta alla domanda)
361
- # Se non riesci a trovare la risposta nel documento, rispondi semplicemente con "Questa domanda è al di là delle mie conoscenze". Ecco i dettagli del documento da considerare:
362
- # <|end|>
363
- # <|user|>
364
- # {context}
365
- # Question: {question}<|end|>
366
- # <|assistant|>
367
- # Sure! The Responses are as follows:
368
- # """
369
- phi_custom_template = """
370
- <|system|>
371
- Sei un esperto della lingua italiana e un madrelingua italiano. Il tuo compito è fornire risposte concise, basate esclusivamente sul documento fornito. Restituisci le informazioni nel seguente formato:
372
- - Nome del documento: (il nome del documento)
373
- - Numero di pagina: (numero di pagina)
374
- - Contenuto: (contenuto rilevante del documento)
375
- Alla fine, fornisci una risposta nel seguente formato:
376
- - Risposta: (la risposta alla domanda)
377
- Se la risposta non è presente nel documento, rispondi con: "Questa domanda è al di là delle mie conoscenze".
378
- <|end|>
379
- <|user|>
380
- {context}
381
- Domanda: {question}
382
- <|end|>
383
- <|assistant|>
384
- """
385
-
386
-
387
-
388
- # phi_custom_template = """
389
- # <|system|>
390
- # Sei un esperto della lingua italiana e un madrelingua italiano. Il tuo compito è fornire risposte concise, dirette e brevi basate sul documento fornito. Dovresti restituire le informazioni nel seguente formato:
391
- # - Nome del documento: (il nome del documento)
392
- # - Numero di pagina: (numero di pagina)
393
- # - Contenuto effettivo: (contenuto rilevante del documento)
394
- # Alla fine, fornisci una sezione separata per la risposta nel seguente formato:
395
- # - Risposta: (la risposta alla domanda)
396
- # Se non riesci a trovare la risposta nel documento, rispondi semplicemente con "Questa domanda è al di là delle mie conoscenze". Ecco i dettagli del documento da considerare:
397
- # <|end|>
398
- # <|user|>
399
- # {context}
400
- # Question: {question}<|end|>
401
- # <|assistant|>
402
- # Sure! The Responses are as follows:
403
-
404
- # <|end|>
405
- # <|user|>
406
- # {context}
407
- # Question: {question}<|end|>
408
- # <|assistant|>
409
- # Sure! The Responses are as follows:
410
- # """
411
-
412
- def generate_bot_response(history, choice, retrieval_mode, model_choice):
413
- if not history:
414
- return
415
-
416
- # Select the model
417
- # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
418
- selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)
419
-
420
-
421
- response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
422
- history[-1][1] = ""
423
-
424
- for character in response:
425
- history[-1][1] += character
426
- yield history # Stream each character as it is generated
427
- time.sleep(0.05) # Add a slight delay to simulate streaming
428
-
429
- yield history # Final yield with the complete response
430
-
431
-
432
- def generate_tts_response(history, tts_choice):
433
- # Get the most recent bot response from the chat history
434
- if history and len(history) > 0:
435
- recent_response = history[-1][1] # The second item in the tuple is the bot response text
436
- else:
437
- recent_response = ""
438
 
439
- # Call the TTS function for the recent response
440
- with concurrent.futures.ThreadPoolExecutor() as executor:
441
- if tts_choice == "Alpha":
442
- audio_future = executor.submit(generate_audio_elevenlabs, recent_response)
443
- elif tts_choice == "Beta":
444
- audio_future = executor.submit(generate_audio_parler_tts, recent_response)
445
 
446
- audio_path = audio_future.result()
447
- return audio_path
448
-
449
-
450
-
451
-
452
-
453
-
454
-
455
- import concurrent.futures
456
- # Existing bot function with concurrent futures for parallel processing
457
- def bot(history, choice, tts_choice, retrieval_mode, model_choice):
458
- # Initialize an empty response
459
- response = ""
460
-
461
- # Create a thread pool to handle both text generation and TTS conversion in parallel
462
- with concurrent.futures.ThreadPoolExecutor() as executor:
463
- # Start the bot response generation in parallel
464
- bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)
465
-
466
- # Wait for the text generation to start
467
- for history_chunk in bot_future.result():
468
- response = history_chunk[-1][1] # Update the response with the current state
469
- yield history_chunk, None # Stream the text output as it's generated
470
-
471
- # Once text is fully generated, start the TTS conversion
472
- tts_future = executor.submit(generate_tts_response, response, tts_choice)
473
-
474
- # Get the audio output after TTS is done
475
- audio_path = tts_future.result()
476
-
477
- # Stream the final text and audio output
478
- yield history, audio_path
479
-
480
-
481
-
482
-
483
-
484
-
485
- import re
486
-
487
-
488
- # def clean_response(response_text):
489
- # # Remove system and user tags
490
- # response_text = re.sub(r'<\|system\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
491
- # response_text = re.sub(r'<\|user\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
492
- # response_text = re.sub(r'<\|assistant\|>', '', response_text, flags=re.DOTALL)
493
-
494
- # # Extract the document name and page number
495
- # document_match = re.search(r"Document\(metadata=\{'source':'(.+?)','page':(\d+)\}", response_text)
496
- # # document_match = re.search(r"Document\(metadata=\{'source':'(.+?)','page':(\d+)\}", response_text)
497
- # if document_match:
498
- # document_name = document_match.group(1).split('/')[-1] # Get the document name
499
- # page_number = document_match.group(2) # Get the page number
500
- # else:
501
- # document_name = "Unknown"
502
- # page_number = "Unknown"
503
-
504
- # # Remove the entire 'Document(metadata=...' and any mention of it from the response
505
- # response_text = re.sub(r'Document\(metadata=\{.*?\},page_content=', '', response_text, flags=re.DOTALL)
506
 
507
- # # Remove any mention of "Document:" in the response
508
- # response_text = re.sub(r'- Document:.*', '', response_text)
509
-
510
- # # Remove any unwanted escape characters like \u and \u00
511
- # response_text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', response_text)
512
-
513
- # # Ensure proper spacing between words and dates
514
- # response_text = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', response_text)
515
- # response_text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', response_text)
516
-
517
- # # Remove the phrase "Sure! The Responses are as follows:" from the actual content
518
- # response_text = re.sub(r'Sure! The Responses are as follows:', '', response_text).strip()
519
-
520
- # # Clean up the text by removing extra whitespace
521
- # cleaned_response = re.sub(r'\s+', ' ', response_text).strip()
522
-
523
- # # Format the final response with bullet points
524
- # final_response = f"""
525
- # Sure! Here is the response for your Query:
526
- # • Document name - {document_name}
527
- # • Page No - {page_number}
528
- # • Responses - {cleaned_response}
529
- # """
530
-
531
- # return final_response
532
-
533
-
534
- import re
535
-
536
- def clean_response(response_text):
537
- # Remove system and user tags
538
- response_text = re.sub(r'<\|system\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
539
- response_text = re.sub(r'<\|user\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
540
- response_text = re.sub(r'<\|assistant\|>', '', response_text, flags=re.DOTALL)
541
-
542
- # Extract the document name and page number from updated pattern
543
- document_match = re.search(r"Document\(metadata=\{'source': '(.+?)', 'page': (\d+)\}", response_text)
544
 
545
- if document_match:
546
- document_name = document_match.group(1).split('/')[-1] # Get the document name
547
- page_number = document_match.group(2) # Get the page number
548
- else:
549
- document_name = "Unknown"
550
- page_number = "Unknown"
551
-
552
- # Remove the entire 'Document(metadata=...' and any mention of it from the response
553
- response_text = re.sub(r'Document\(metadata=\{.*?\},page_content=', '', response_text, flags=re.DOTALL)
554
 
555
- # Remove any mention of "Document:" in the response
556
- response_text = re.sub(r'- Document:.*', '', response_text)
557
-
558
- # Remove any unwanted escape characters like \u and \u00
559
- response_text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', response_text)
560
-
561
- # Ensure proper spacing between words and dates
562
- response_text = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', response_text)
563
- response_text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', response_text)
564
-
565
- # Remove the phrase "Sure! The Responses are as follows:" from the actual content
566
- response_text = re.sub(r'Sure! The Responses are as follows:', '', response_text).strip()
567
-
568
- # Clean up the text by removing extra whitespace
569
- cleaned_response = re.sub(r'\s+', ' ', response_text).strip()
570
-
571
- # Format the final response with bullet points
572
- final_response = f"""
573
- Sure! Here is the response for your Query:
574
- • Document name - {document_name}
575
- • Page No - {page_number}
576
- • Responses - {cleaned_response}
577
- """
578
-
579
- return final_response
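A quick, self-contained check of the cleaner above (the sample string is hypothetical, shaped like a raw LM-2 generation that still carries the prompt tags and a retrieved Document(...) header):

# Hypothetical raw output, similar to what the pipeline returns with return_full_text=True.
sample = (
    "<|system|>istruzioni...<|end|><|user|>contesto e domanda<|end|><|assistant|>"
    "Sure! The Responses are as follows: Document(metadata={'source': 'docs/manuale.pdf', 'page': 12},page_content="
    "'Il capitolo descrive la configurazione della radio.')"
)
print(clean_response(sample))
# Prints roughly:
# Sure! Here is the response for your Query:
# • Document name - manuale.pdf
# • Page No - 12
# • Responses - 'Il capitolo descrive la configurazione della radio.')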
580
-
581
-
582
-
583
-
584
-
585
-
586
-
587
-
588
-
589
-
590
- # Define a new template specifically for GPT-4o-mini in VDB Details mode
591
- gpt4o_mini_template_details = f"""
592
- As a highly specialized assistant, I provide precise, detailed, and informative responses. On this bright day of {current_date}, I'm equipped to assist with all your queries about Birmingham, Alabama, offering detailed insights tailored to your needs.
593
- Given your request, here is the detailed information you're seeking:
594
- {{context}}
595
- Question: {{question}}
596
- Detailed Answer:
597
- """
598
- #dataframe on gradio
599
- import pandas as pd
600
- # import ace_tools as tools # For displaying the DataFrame in Gradio
601
-
602
- import traceback
603
-
604
- def generate_answer(message, choice, retrieval_mode, selected_model):
605
- logging.debug(f"generate_answer called with choice: {choice}, retrieval_mode: {retrieval_mode}, and selected_model: {selected_model}")
606
-
607
- # Logic for disabling options for Phi-3.5
608
- if selected_model == "LM-2":
609
- choice = None
610
- retrieval_mode = None
611
-
612
- try:
613
- # Select the appropriate template based on the choice and model
614
- if choice == "Details" and selected_model == chat_model1: # GPT-4o-mini
615
- prompt_template = PromptTemplate(input_variables=["context", "question"], template=gpt4o_mini_template_details)
616
- elif choice == "Details":
617
- prompt_template = QA_CHAIN_PROMPT_1
618
- elif choice == "Conversational":
619
- prompt_template = QA_CHAIN_PROMPT_2
620
- else:
621
- prompt_template = QA_CHAIN_PROMPT_1 # Fallback to template1
622
-
623
- # # Handle hotel-related queries
624
- # if "hotel" in message.lower() or "hotels" in message.lower() and "birmingham" in message.lower():
625
- # logging.debug("Handling hotel-related query")
626
- # response = fetch_google_hotels()
627
- # logging.debug(f"Hotel response: {response}")
628
- # return response, extract_addresses(response)
629
-
630
- # # Handle restaurant-related queries
631
- # if "restaurant" in message.lower() or "restaurants" in message.lower() and "birmingham" in message.lower():
632
- # logging.debug("Handling restaurant-related query")
633
- # response = fetch_yelp_restaurants()
634
- # logging.debug(f"Restaurant response: {response}")
635
- # return response, extract_addresses(response)
636
-
637
- # # Handle flight-related queries
638
- # if "flight" in message.lower() or "flights" in message.lower() and "birmingham" in message.lower():
639
- # logging.debug("Handling flight-related query")
640
- # response = fetch_google_flights()
641
- # logging.debug(f"Flight response: {response}")
642
- # return response, extract_addresses(response)
643
-
644
- # Retrieval-based response
645
- if retrieval_mode == "VDB":
646
- logging.debug("Using VDB retrieval mode")
647
- if selected_model == chat_model:
648
- logging.debug("Selected model: LM-1")
649
- retriever = gpt_retriever
650
- context = retriever.get_relevant_documents(message)
651
- logging.debug(f"Retrieved context: {context}")
652
-
653
- prompt = prompt_template.format(context=context, question=message)
654
- logging.debug(f"Generated prompt: {prompt}")
655
-
656
- qa_chain = RetrievalQA.from_chain_type(
657
- llm=chat_model,
658
- chain_type="stuff",
659
- retriever=retriever,
660
- chain_type_kwargs={"prompt": prompt_template}
661
- )
662
- response = qa_chain({"query": message})
663
- logging.debug(f"LM-1 response: {response}")
664
- return response['result'], extract_addresses(response['result'])
665
-
666
- elif selected_model == chat_model1:
667
- logging.debug("Selected model: LM-3")
668
- retriever = gpt_retriever
669
- context = retriever.get_relevant_documents(message)
670
- logging.debug(f"Retrieved context: {context}")
671
-
672
- prompt = prompt_template.format(context=context, question=message)
673
- logging.debug(f"Generated prompt: {prompt}")
674
-
675
- qa_chain = RetrievalQA.from_chain_type(
676
- llm=chat_model1,
677
- chain_type="stuff",
678
- retriever=retriever,
679
- chain_type_kwargs={"prompt": prompt_template}
680
- )
681
- response = qa_chain({"query": message})
682
- logging.debug(f"LM-3 response: {response}")
683
- return response['result'], extract_addresses(response['result'])
684
-
685
-
686
-
687
- elif selected_model == phi_pipe:
688
- logging.debug("Selected model: LM-2")
689
- retriever = phi_retriever
690
- context_documents = retriever.get_relevant_documents(message)
691
- context = "\n".join([doc.page_content for doc in context_documents])
692
- logging.debug(f"Retrieved context for LM-2: {context}")
693
-
694
- # Use the correct template variable
695
- prompt = phi_custom_template.format(context=context, question=message)
696
- logging.debug(f"Generated LM-2 prompt: {prompt}")
697
-
698
- response = selected_model(prompt, **{
699
- "max_new_tokens": 250,
700
- "return_full_text": True,
701
- "temperature": 0.1,
702
- "do_sample": True,
703
- })
704
-
705
- if response:
706
- generated_text = response[0]['generated_text']
707
- logging.debug(f"LM-2 Response: {generated_text}")
708
- cleaned_response = clean_response(generated_text)
709
- return cleaned_response, extract_addresses(cleaned_response)
710
- else:
711
- logging.error("LM-2 did not return any response.")
712
- return "No response generated.", []
713
-
714
- elif retrieval_mode == "KGF":
715
- logging.debug("Using KGF retrieval mode")
716
- response = chain_neo4j.invoke({"question": message})
717
- logging.debug(f"KGF response: {response}")
718
- return response, extract_addresses(response)
719
- else:
720
- logging.error("Invalid retrieval mode selected.")
721
- return "Invalid retrieval mode selected.", []
722
-
723
- except Exception as e:
724
- logging.error(f"Error in generate_answer: {str(e)}")
725
- logging.error(traceback.format_exc())
726
- return "Sorry, I encountered an error while processing your request.", []
727
-
728
- # def generate_answer(message, choice, retrieval_mode, selected_model):
729
- # # Logic for Phi-3.5
730
- # if selected_model == phi_pipe: # LM-2 Phi-3.5 selected
731
- # retriever = phi_retriever
732
- # context_documents = retriever.get_relevant_documents(message)
733
- # context = "\n".join([doc.page_content for doc in context_documents])
734
-
735
- # # Use the correct template for Phi-3.5
736
- # prompt = phi_custom_template.format(context=context, question=message)
737
-
738
- # response = selected_model(prompt, **{
739
- # "max_new_tokens": 400,
740
- # "return_full_text": True,
741
- # "temperature": 0.7,
742
- # "do_sample": True,
743
- # })
744
-
745
- # if response:
746
- # generated_text = response[0]['generated_text']
747
- # cleaned_response = clean_response(generated_text)
748
- # # return cleaned_response, extract_addresses(cleaned_response)
749
- # return cleaned_response
750
- # else:
751
- # return "No response generated.", []
752
-
753
-
754
-
755
-
756
-
757
-
758
-
759
- def add_message(history, message):
760
- history.append((message, None))
761
- return history, gr.Textbox(value="", interactive=True, show_label=False)
762
-
763
-
764
-
765
- def print_like_dislike(x: gr.LikeData):
766
- print(x.index, x.value, x.liked)
767
-
768
- def extract_addresses(response):
769
- if not isinstance(response, str):
770
- response = str(response)
771
- address_patterns = [
772
- r'([A-Z].*,\sBirmingham,\sAL\s\d{5})',
773
- r'(\d{4}\s.*,\sBirmingham,\sAL\s\d{5})',
774
- r'([A-Z].*,\sAL\s\d{5})',
775
- r'([A-Z].*,.*\sSt,\sBirmingham,\sAL\s\d{5})',
776
- r'([A-Z].*,.*\sStreets,\sBirmingham,\sAL\s\d{5})',
777
- r'(\d{2}.*\sStreets)',
778
- r'([A-Z].*\s\d{2},\sBirmingham,\sAL\s\d{5})',
779
- r'([a-zA-Z]\s Birmingham)',
780
- r'([a-zA-Z].*,\sBirmingham,\sAL)',
781
- r'(.*),(Birmingham, AL,USA)$',
- r'(^Birmingham,AL$)',
783
- r'((.*)(Stadium|Field),.*,\sAL$)',
784
- r'((.*)(Stadium|Field),.*,\sFL$)',
785
- r'((.*)(Stadium|Field),.*,\sMS$)',
786
- r'((.*)(Stadium|Field),.*,\sAR$)',
787
- r'((.*)(Stadium|Field),.*,\sKY$)',
788
- r'((.*)(Stadium|Field),.*,\sTN$)',
789
- r'((.*)(Stadium|Field),.*,\sLA$)',
790
- r'((.*)(Stadium|Field),.*,\sFL$)'
791
-
792
- ]
793
- addresses = []
794
- for pattern in address_patterns:
795
- addresses.extend(re.findall(pattern, response))
796
- return addresses
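For illustration, the pattern list behaves roughly like this on a made-up sentence (hypothetical example):

text = "The stadium is at 1020 Tuscaloosa Ave, Birmingham, AL 35211 near downtown."
print(extract_addresses(text))
# Several patterns match the ', Birmingham, AL 35211' form, so the result holds one entry per
# matching pattern; note the greedy '.*' also pulls leading words into the captured string.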
797
-
798
- all_addresses = []
799
-
800
- def generate_map(location_names):
801
- global all_addresses
802
- all_addresses.extend(location_names)
803
-
804
- api_key = os.environ['GOOGLEMAPS_API_KEY']
805
- gmaps = GoogleMapsClient(key=api_key)
806
-
807
- m = folium.Map(location=[33.5175, -86.809444], zoom_start=12)
808
-
809
- for location_name in all_addresses:
810
- geocode_result = gmaps.geocode(location_name)
811
- if geocode_result:
812
- location = geocode_result[0]['geometry']['location']
813
- folium.Marker(
814
- [location['lat'], location['lng']],
815
- tooltip=f"{geocode_result[0]['formatted_address']}"
816
- ).add_to(m)
817
-
818
- map_html = m._repr_html_()
819
- return map_html
820
-
821
- from diffusers import DiffusionPipeline
822
- import torch
823
-
824
-
825
- def fetch_local_news():
826
- api_key = os.environ['SERP_API']
827
- url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
828
- response = requests.get(url)
829
- if response.status_code == 200:
830
- results = response.json().get("news_results", [])
831
- news_html = """
832
- <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
833
- <style>
834
- .news-item {
835
- font-family: 'Verdana', sans-serif;
836
- color: #333;
837
- background-color: #f0f8ff;
838
- margin-bottom: 15px;
839
- padding: 10px;
840
- border-radius: 5px;
841
- transition: box-shadow 0.3s ease, background-color 0.3s ease;
842
- font-weight: bold;
843
- }
844
- .news-item:hover {
845
- box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
846
- background-color: #e6f7ff;
847
- }
848
- .news-item a {
849
- color: #1E90FF;
850
- text-decoration: none;
851
- font-weight: bold;
852
- }
853
- .news-item a:hover {
854
- text-decoration: underline;
855
- }
856
- .news-preview {
857
- position: absolute;
858
- display: none;
859
- border: 1px solid #ccc;
860
- border-radius: 5px;
861
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
862
- background-color: white;
863
- z-index: 1000;
864
- max-width: 300px;
865
- padding: 10px;
866
- font-family: 'Verdana', sans-serif;
867
- color: #333;
868
- }
869
- </style>
870
- <script>
871
- function showPreview(event, previewContent) {
872
- var previewBox = document.getElementById('news-preview');
873
- previewBox.innerHTML = previewContent;
874
- previewBox.style.left = event.pageX + 'px';
875
- previewBox.style.top = event.pageY + 'px';
876
- previewBox.style.display = 'block';
877
- }
878
- function hidePreview() {
879
- var previewBox = document.getElementById('news-preview');
880
- previewBox.style.display = 'none';
881
- }
882
- </script>
883
- <div id="news-preview" class="news-preview"></div>
884
- """
885
- for index, result in enumerate(results[:7]):
886
- title = result.get("title", "No title")
887
- link = result.get("link", "#")
888
- snippet = result.get("snippet", "")
889
- news_html += f"""
890
- <div class="news-item" onmouseover="showPreview(event, '{snippet}')" onmouseout="hidePreview()">
891
- <a href='{link}' target='_blank'>{index + 1}. {title}</a>
892
- <p>{snippet}</p>
893
- </div>
894
- """
895
- return news_html
896
- else:
897
- return "<p>Failed to fetch local news</p>"
898
-
899
- import numpy as np
900
- import torch
901
- from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
902
-
903
- model_id = 'openai/whisper-large-v3'
904
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
905
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
906
- model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
907
- processor = AutoProcessor.from_pretrained(model_id)
908
-
909
- pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
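A minimal offline check of the Whisper pipeline above (hypothetical file name; librosa is already imported at the top of the file):

# Assumes a local 'sample.wav'; Whisper models expect 16 kHz mono input.
audio, sr = librosa.load("sample.wav", sr=16000)
result = pipe_asr({"array": audio, "sampling_rate": sr}, return_timestamps=False)
print(result["text"])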
910
-
911
- base_audio_drive = "/data/audio"
912
-
913
- # Streaming transcription helper; Gradio delivers chunks as (sample_rate, ndarray) tuples (typically 44100 Hz)
914
-
915
- def transcribe_function(stream, new_chunk):
916
- try:
917
- sr, y = new_chunk[0], new_chunk[1]
918
- except TypeError:
919
- print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
920
- return stream, "", None
921
-
922
- y = y.astype(np.float32) / np.max(np.abs(y))
923
-
924
- if stream is not None:
925
- stream = np.concatenate([stream, y])
926
- else:
927
- stream = y
928
-
929
- result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
930
-
931
- full_text = result.get("text","")
932
-
933
- return stream, full_text, result
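The helper above follows Gradio's streaming-microphone pattern: stream is the accumulated waveform carried in a gr.State, and each new_chunk arrives as a (sample_rate, ndarray) tuple. A sketch of the wiring, assuming Gradio 4.x (component names are illustrative, not from this commit):

with gr.Blocks() as asr_demo:
    state = gr.State(None)                                   # accumulated audio buffer
    mic = gr.Audio(sources=["microphone"], streaming=True)   # emits (sr, chunk) tuples
    live_text = gr.Textbox(label="Transcript")
    raw = gr.JSON(visible=False)
    mic.stream(transcribe_function, inputs=[state, mic], outputs=[state, live_text, raw])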
934
-
935
-
936
-
937
-
938
-
939
-
940
-
941
- # def update_map_with_response(history):
942
- # if not history:
943
- # return ""
944
- # response = history[-1][1]
945
- # addresses = extract_addresses(response)
946
- # return generate_map(addresses)
947
-
948
- def clear_textbox():
949
- return ""
950
-
951
- # def show_map_if_details(history, choice):
952
- # if choice in ["Details", "Conversational"]:
953
- # return gr.update(visible=True), update_map_with_response(history)
954
- # else:
955
- # return gr.update(visible=False), ""
956
-
957
-
958
-
959
-
960
-
961
-
962
-
963
-
964
- def generate_audio_elevenlabs(text):
965
- XI_API_KEY = os.environ['ELEVENLABS_API']
966
- VOICE_ID = 'd9MIrwLnvDeH7aZb61E9'
967
- tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
968
- headers = {
969
- "Accept": "application/json",
970
- "xi-api-key": XI_API_KEY
971
- }
972
- data = {
973
- "text": str(text),
974
- "model_id": "eleven_multilingual_v2",
975
- "voice_settings": {
976
- "stability": 1.0,
977
- "similarity_boost": 0.0,
978
- "style": 0.60,
979
- "use_speaker_boost": False
980
- }
981
- }
982
- response = requests.post(tts_url, headers=headers, json=data, stream=True)
983
- if response.ok:
984
- audio_segments = []
985
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
986
- for chunk in response.iter_content(chunk_size=1024):
987
- if chunk:
988
- f.write(chunk)
989
- audio_segments.append(chunk)
990
- temp_audio_path = f.name
991
-
992
- # Combine all audio chunks into a single file
993
- combined_audio = AudioSegment.from_file(temp_audio_path, format="mp3")
994
- combined_audio_path = os.path.join(tempfile.gettempdir(), "elevenlabs_combined_audio.mp3")
995
- combined_audio.export(combined_audio_path, format="mp3")
996
-
997
- logging.debug(f"Audio saved to {combined_audio_path}")
998
- return combined_audio_path
999
- else:
1000
- logging.error(f"Error generating audio: {response.text}")
1001
- return None
1002
-
1003
-
1004
-
1005
-
1006
- # chunking audio and then Process
1007
-
1008
- import concurrent.futures
1009
- import tempfile
1010
- import os
1011
- import numpy as np
1012
- import logging
1013
- from queue import Queue
1014
- from threading import Thread
1015
- from scipy.io.wavfile import write as write_wav
1016
- from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
1017
- from transformers import AutoTokenizer
1018
-
1019
- # Ensure your device is set to CUDA
1020
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
1021
-
1022
- repo_id = "parler-tts/parler-tts-mini-v1"
1023
-
1024
- def generate_audio_parler_tts(text):
1025
- description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
1026
- chunk_size_in_s = 0.5
1027
-
1028
- # Initialize the tokenizer and model
1029
- parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
1030
- parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
1031
- sampling_rate = parler_model.audio_encoder.config.sampling_rate
1032
- frame_rate = parler_model.audio_encoder.config.frame_rate
1033
-
1034
- def generate(text, description, play_steps_in_s=0.5):
1035
- play_steps = int(frame_rate * play_steps_in_s)
1036
- streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
1037
-
1038
- inputs = parler_tokenizer(description, return_tensors="pt").to(device)
1039
- prompt = parler_tokenizer(text, return_tensors="pt").to(device)
1040
-
1041
- generation_kwargs = dict(
1042
- input_ids=inputs.input_ids,
1043
- prompt_input_ids=prompt.input_ids,
1044
- attention_mask=inputs.attention_mask,
1045
- prompt_attention_mask=prompt.attention_mask,
1046
- streamer=streamer,
1047
- do_sample=True,
1048
- temperature=1.0,
1049
- min_new_tokens=10,
1050
- )
1051
-
1052
- thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
1053
- thread.start()
1054
-
1055
- for new_audio in streamer:
1056
- if new_audio.shape[0] == 0:
1057
- break
1058
- # Save or process each audio chunk as it is generated
1059
- yield sampling_rate, new_audio
1060
-
1061
- audio_segments = []
1062
- for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
1063
- audio_segments.append(audio_chunk)
1064
-
1065
- temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
1066
- write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
1067
- logging.debug(f"Saved chunk to {temp_audio_path}")
1068
-
1069
-
1070
- # Combine all the audio chunks into one audio file
1071
- combined_audio = np.concatenate(audio_segments)
1072
- combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
1073
-
1074
- write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
1075
-
1076
- logging.debug(f"Combined audio saved to {combined_audio_path}")
1077
- return combined_audio_path
1078
-
1079
-
1080
- def fetch_local_events():
1081
- api_key = os.environ['SERP_API']
1082
- url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
1083
- response = requests.get(url)
1084
- if response.status_code == 200:
1085
- events_results = response.json().get("events_results", [])
1086
- events_html = """
1087
- <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Local Events</h2>
1088
- <style>
1089
- table {
1090
- font-family: 'Verdana', sans-serif;
1091
- color: #333;
1092
- border-collapse: collapse;
1093
- width: 100%;
1094
- }
1095
- th, td {
1096
- border: 1px solid #fff !important;
1097
- padding: 8px;
1098
- }
1099
- th {
1100
- background-color: #f2f2f2;
1101
- color: #333;
1102
- text-align: left;
1103
- }
1104
- tr:hover {
1105
- background-color: #f5f5f5;
1106
- }
1107
- .event-link {
1108
- color: #1E90FF;
1109
- text-decoration: none;
1110
- }
1111
- .event-link:hover {
1112
- text-decoration: underline;
1113
- }
1114
- </style>
1115
- <table>
1116
- <tr>
1117
- <th>Title</th>
1118
- <th>Date and Time</th>
1119
- <th>Location</th>
1120
- </tr>
1121
- """
1122
- for event in events_results:
1123
- title = event.get("title", "No title")
1124
- date_info = event.get("date", {})
1125
- date = f"{date_info.get('start_date', '')} {date_info.get('when', '')}".replace("{", "").replace("}", "")
1126
- location = event.get("address", "No location")
1127
- if isinstance(location, list):
1128
- location = " ".join(location)
1129
- location = location.replace("[", "").replace("]", "")
1130
- link = event.get("link", "#")
1131
- events_html += f"""
1132
- <tr>
1133
- <td><a class='event-link' href='{link}' target='_blank'>{title}</a></td>
1134
- <td>{date}</td>
1135
- <td>{location}</td>
1136
- </tr>
1137
- """
1138
- events_html += "</table>"
1139
- return events_html
1140
- else:
1141
- return "<p>Failed to fetch local events</p>"
1142
-
1143
- def get_weather_icon(condition):
1144
- condition_map = {
1145
- "Clear": "c01d",
1146
- "Partly Cloudy": "c02d",
1147
- "Cloudy": "c03d",
1148
- "Overcast": "c04d",
1149
- "Mist": "a01d",
1150
- "Patchy rain possible": "r01d",
1151
- "Light rain": "r02d",
1152
- "Moderate rain": "r03d",
1153
- "Heavy rain": "r04d",
1154
- "Snow": "s01d",
1155
- "Thunderstorm": "t01d",
1156
- "Fog": "a05d",
1157
  }
1158
- return condition_map.get(condition, "c04d")
1159
-
1160
- def fetch_local_weather():
1161
- try:
1162
- api_key = os.environ['WEATHER_API']
1163
- url = f'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/birmingham?unitGroup=metric&include=events%2Calerts%2Chours%2Cdays%2Ccurrent&key={api_key}'
1164
- response = requests.get(url)
1165
- response.raise_for_status()
1166
- jsonData = response.json()
1167
-
1168
- current_conditions = jsonData.get("currentConditions", {})
1169
- temp_celsius = current_conditions.get("temp", "N/A")
1170
-
1171
- if temp_celsius != "N/A":
1172
- temp_fahrenheit = int((temp_celsius * 9/5) + 32)
1173
- else:
1174
- temp_fahrenheit = "N/A"
1175
-
1176
- condition = current_conditions.get("conditions", "N/A")
1177
- humidity = current_conditions.get("humidity", "N/A")
1178
-
1179
- weather_html = f"""
1180
- <div class="weather-theme">
1181
- <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Local Weather</h2>
1182
- <div class="weather-content">
1183
- <div class="weather-icon">
1184
- <img src="https://www.weatherbit.io/static/img/icons/{get_weather_icon(condition)}.png" alt="{condition}" style="width: 100px; height: 100px;">
1185
- </div>
1186
- <div class="weather-details">
1187
- <p style="font-family: 'Verdana', sans-serif; color: #333; font-size: 1.2em;">Temperature: {temp_fahrenheit}°F</p>
1188
- <p style="font-family: 'Verdana', sans-serif; color: #333; font-size: 1.2em;">Condition: {condition}</p>
1189
- <p style="font-family: 'Verdana', sans-serif; color: #333; font-size: 1.2em;">Humidity: {humidity}%</p>
1190
- </div>
1191
- </div>
1192
- </div>
1193
- <style>
1194
- .weather-theme {{
1195
- animation: backgroundAnimation 10s infinite alternate;
1196
- border-radius: 10px;
1197
- padding: 10px;
1198
- margin-bottom: 15px;
1199
- background: linear-gradient(45deg, #ffcc33, #ff6666, #ffcc33, #ff6666);
1200
- background-size: 400% 400%;
1201
- box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
1202
- transition: box-shadow 0.3s ease, background-color 0.3s ease;
1203
- }}
1204
- .weather-theme:hover {{
1205
- box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2);
1206
- background-position: 100% 100%;
1207
- }}
1208
- @keyframes backgroundAnimation {{
1209
- 0% {{ background-position: 0% 50%; }}
1210
- 100% {{ background-position: 100% 50%; }}
1211
- }}
1212
- .weather-content {{
1213
- display: flex;
1214
- align-items: center;
1215
- }}
1216
- .weather-icon {{
1217
- flex: 1;
1218
- }}
1219
- .weather-details {{
1220
- flex: 3;
1221
- }}
1222
- </style>
1223
- """
1224
- return weather_html
1225
- except requests.exceptions.RequestException as e:
1226
- return f"<p>Failed to fetch local weather: {e}</p>"
1227
-
1228
-
1229
- def handle_retrieval_mode_change(choice):
1230
- if choice == "KGF":
1231
- return gr.update(interactive=False), gr.update(interactive=False)
1232
- else:
1233
- return gr.update(interactive=True), gr.update(interactive=True)
1234
-
1235
-
1236
-
1237
- def handle_model_choice_change(selected_model):
1238
- if selected_model == "LM-2":
1239
- # Disable retrieval mode and select style when LM-2 is selected
1240
- return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
1241
- elif selected_model == "LM-1":
1242
- # Enable retrieval mode and select style for LM-1
1243
- return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
1244
- else:
1245
- # Default case: allow interaction
1246
- return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
1247
-
1248
- # def handle_model_choice_change(selected_model):
1249
- # if selected_model == "LM-2": # When LM-2 (Phi-3.5) is selected
1250
- # # Disable retrieval mode and select style when LM-2 is selected
1251
- # return (
1252
- # gr.update(interactive=False), # Disable retrieval mode
1253
- # gr.update(interactive=False), # Disable style (Details/Conversational)
1254
- # gr.update(interactive=False) # Disable the model choice itself
1255
- # )
1256
- # else:
1257
- # # Disable GPT-4o, GPT-4o-mini, and KGF, only Phi-3.5 works
1258
- # return (
1259
- # gr.update(interactive=True), # Allow retrieval mode for other models
1260
- # gr.update(interactive=True), # Allow style options for other models
1261
- # gr.update(interactive=True) # Allow other models to be selected
1262
- # )
1263
-
1264
- #Flux Coding
1265
-
1266
-
1267
- # Existing prompts for the Flux API
1268
- hardcoded_prompt_1 = "A high quality cinematic image for Toyota Truck in Birmingham skyline shot in the style of Michael Mann"
1269
- hardcoded_prompt_2 = "A high quality cinematic image for Alabama Quarterback close up emotional shot in the style of Michael Mann"
1270
- hardcoded_prompt_3 = "A high quality cinematic image for Taylor Swift concert in Birmingham skyline style of Michael Mann"
1271
-
1272
- # Function to call the Flux API and generate images
1273
- def generate_image_flux(prompt):
1274
- # client = Client("black-forest-labs/FLUX.1-schnell",hf_token=hf_token)
1275
- client = Client("Pijush2023/radar_flux")
1276
- result = client.predict(
1277
- prompt=prompt,
1278
- seed=0,
1279
- randomize_seed=True,
1280
- width=400,
1281
- height=400,
1282
- num_inference_steps=2,
1283
- api_name="/infer"
1284
- )
1285
 
1286
- # Assuming that the API response contains an image file or URL, extract the image part
1287
- if isinstance(result, tuple):
1288
- # Extract the image URL or path if it is a tuple
1289
- image_path_or_url = result[0] # Adjust this index based on the actual structure of the response
1290
- else:
1291
- image_path_or_url = result
1292
 
1293
- return image_path_or_url # Return the image path or URL directly
1294
-
1295
- # Function to update images with the three prompts
1296
- def update_images():
1297
- image_1 = generate_image_flux(hardcoded_prompt_1)
1298
- image_2 = generate_image_flux(hardcoded_prompt_2)
1299
- image_3 = generate_image_flux(hardcoded_prompt_3)
1300
- return image_1, image_2, image_3
1301
-
1302
-
1303
-
1304
-
1305
-
1306
- def format_restaurant_hotel_info(name, link, location, phone, rating, reviews, snippet):
1307
- return f"""
1308
- {name}
1309
- - Link: {link}
1310
- - Location: {location}
1311
- - Contact No: {phone}
1312
- - Rating: {rating} stars ({reviews} reviews)
1313
- - Snippet: {snippet}
1314
- """
1315
-
1316
- def fetch_yelp_restaurants():
1317
- # Introductory prompt for restaurants
1318
- intro_prompt = "Here are some of the top-rated restaurants in Birmingham, Alabama. I hope these suggestions help you find the perfect place to enjoy your meal:"
1319
-
1320
- params = {
1321
- "engine": "yelp",
1322
- "find_desc": "Restaurant",
1323
- "find_loc": "Birmingham, AL, USA",
1324
- "api_key": os.getenv("SERP_API")
1325
- }
1326
-
1327
- search = GoogleSearch(params)
1328
- results = search.get_dict()
1329
- organic_results = results.get("organic_results", [])
1330
-
1331
- response_text = f"{intro_prompt}\n"
1332
-
1333
- for result in organic_results[:5]: # Limiting to top 5 restaurants
1334
- name = result.get("title", "No name")
1335
- rating = result.get("rating", "No rating")
1336
- reviews = result.get("reviews", "No reviews")
1337
- phone = result.get("phone", "Not Available")
1338
- snippet = result.get("snippet", "Not Available")
1339
- location = f"{name}, Birmingham, AL,USA"
1340
- link = result.get("link", "#")
1341
-
1342
- response_text += format_restaurant_hotel_info(name, link, location, phone, rating, reviews, snippet)
1343
-
1344
-
1345
- return response_text
1346
-
1347
 
1348
-
1349
-
1350
-
1351
-
1352
- def format_hotel_info(name, link, location, rate_per_night, total_rate, description, check_in_time, check_out_time, amenities):
1353
- return f"""
1354
- {name}
1355
- - Link: {link}
1356
- - Location: {location}
1357
- - Rate per Night: {rate_per_night} (Before taxes/fees: {total_rate})
1358
- - Check-in Time: {check_in_time}
1359
- - Check-out Time: {check_out_time}
1360
- - Amenities: {amenities}
1361
- - Description: {description}
1362
- """
1363
-
1364
- def fetch_google_hotels(query="Birmingham Hotel", check_in=current_date1, check_out="2024-09-02", adults=2):
1365
- # Introductory prompt for hotels
1366
- intro_prompt = "Here are some of the best hotels in Birmingham, Alabama, for your stay. Each of these options offers a unique experience, whether you're looking for luxury, comfort, or convenience:"
1367
-
1368
- params = {
1369
- "engine": "google_hotels",
1370
- "q": query,
1371
- "check_in_date": check_in,
1372
- "check_out_date": check_out,
1373
- "adults": str(adults),
1374
- "currency": "USD",
1375
- "gl": "us",
1376
- "hl": "en",
1377
- "api_key": os.getenv("SERP_API")
1378
- }
1379
-
1380
- search = GoogleSearch(params)
1381
- results = search.get_dict()
1382
- hotel_results = results.get("properties", [])
1383
-
1384
- hotel_info = f"{intro_prompt}\n"
1385
- for hotel in hotel_results[:5]: # Limiting to top 5 hotels
1386
- name = hotel.get('name', 'No name')
1387
- description = hotel.get('description', 'No description')
1388
- link = hotel.get('link', '#')
1389
- check_in_time = hotel.get('check_in_time', 'N/A')
1390
- check_out_time = hotel.get('check_out_time', 'N/A')
1391
- rate_per_night = hotel.get('rate_per_night', {}).get('lowest', 'N/A')
1392
- before_taxes_fees = hotel.get('rate_per_night', {}).get('before_taxes_fees', 'N/A')
1393
- total_rate = hotel.get('total_rate', {}).get('lowest', 'N/A')
1394
- amenities = ", ".join(hotel.get('amenities', [])) if hotel.get('amenities') else "Not Available"
1395
-
1396
- location = f"{name}, Birmingham, AL,USA"
1397
-
1398
- hotel_info += format_hotel_info(
1399
- name,
1400
- link,
1401
- location,
1402
- rate_per_night,
1403
- total_rate,
1404
- description,
1405
- check_in_time,
1406
- check_out_time,
1407
- amenities
1408
- )
1409
-
1410
-
1411
- return hotel_info
1412
-
1413
-
1414
-
1415
-
1416
- def format_flight_info(flight_number, departure_airport, departure_time, arrival_airport, arrival_time, duration, airplane):
-     return f"""
-     Flight {flight_number}
-     - Departure: {departure_airport} at {departure_time}
-     - Arrival: {arrival_airport} at {arrival_time}
-     - Duration: {duration} minutes
-     - Airplane: {airplane}
-     """
-
- def fetch_google_flights(departure_id="JFK", arrival_id="BHM", outbound_date=current_date1, return_date="2024-08-20"):
-     # Introductory prompt for flights
-     intro_prompt = "Here are some available flights from JFK to Birmingham, Alabama. These options provide a range of times and durations to fit your travel needs:"
-
-     params = {
-         "engine": "google_flights",
-         "departure_id": departure_id,
-         "arrival_id": arrival_id,
-         "outbound_date": outbound_date,
-         "return_date": return_date,
-         "currency": "USD",
-         "hl": "en",
-         "api_key": os.getenv("SERP_API")
-     }
-
-     search = GoogleSearch(params)
-     results = search.get_dict()
-
-     # Extract flight details from the results
-     best_flights = results.get('best_flights', [])
-     flight_info = f"{intro_prompt}\n"
-
-     # Process each flight in the best_flights list
-     for i, flight in enumerate(best_flights, start=1):
-         for segment in flight.get('flights', []):
-             departure_airport = segment.get('departure_airport', {}).get('name', 'Unknown Departure Airport')
-             departure_time = segment.get('departure_airport', {}).get('time', 'Unknown Time')
-             arrival_airport = segment.get('arrival_airport', {}).get('name', 'Unknown Arrival Airport')
-             arrival_time = segment.get('arrival_airport', {}).get('time', 'Unknown Time')
-             duration = segment.get('duration', 'Unknown Duration')
-             airplane = segment.get('airplane', 'Unknown Airplane')
-
-             # Format the flight segment details
-             flight_info += format_flight_info(
-                 flight_number=i,
-                 departure_airport=departure_airport,
-                 departure_time=departure_time,
-                 arrival_airport=arrival_airport,
-                 arrival_time=arrival_time,
-                 duration=duration,
-                 airplane=airplane
-             )
-
-     return flight_info
-
- # examples = [
- #     ["What are the concerts in Birmingham?"],
- #     ["what are some of the upcoming matches of crimson tide?"],
- #     ["where from i will get a Hamburger?"],
- #     ["What are some of the hotels at birmingham?"],
- #     ["how can i connect the alexa to the radio?"],
- #     ["What are some of the good clubs at birmingham?"],
- #     ["How do I call the radio station?"],
- #     ["What’s the spread?"],
- #     ["What time is Crimson Tide Rewind?"],
- #     ["What time is Alabama kick-off?"],
- #     ["who are some of the popular players of crimson tide?"]
- # ]
-
- # # Function to insert the prompt into the textbox when clicked
- # def insert_prompt(current_text, prompt):
- #     return prompt[0] if prompt else current_text
-
- # Create a global list to store uploaded document records
- uploaded_documents = []
- from datetime import datetime
-
- from langchain_core.documents import Document
- # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
- def process_pdf(pdf_file,uploaded_documents):
-     if pdf_file is None:
-         return uploaded_documents, "No PDF file uploaded."
-     with pdfplumber.open(pdf_file) as pdf:
-         all_text = ""
-         for page in pdf.pages:
-             all_text += page.extract_text()
-
-     # Split the text into chunks
-     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-     chunks = text_splitter.split_text(all_text)
-
-     # Embed and upload the chunks into the vector database
-     chunk_ids = []
-     for chunk in chunks:
-         document = Document(page_content=chunk)
-         chunk_id = vectorstore.add_documents([document])
-         chunk_ids.append(chunk_id)
-
-     # Update the upload history
-     document_record = {
-         "Document Name": pdf_file.name,
-         "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-         "Chunks": len(chunks),
-         "Pinecone Index": index_name
-     }
 
-     # Add the record to the global list
-     uploaded_documents.append(document_record)
-
-     # Convert the list of dictionaries into a list of lists for the dataframe
-     table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
-
-     return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
-
- with gr.Blocks(theme='gradio/soft') as demo:
-
     with gr.Row():
         with gr.Column():
-             state = gr.State()
-
-             chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
-             choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational",interactive=False,visible=False)
-             retrieval_mode = gr.Radio(label="Retrieval Mode", choices=["VDB", "KGF"], value="VDB",interactive=False,visible=False)
-             model_choice = gr.Dropdown(label="Choose Model", choices=["LM-1"], value="LM-1")
-
-             # Link the dropdown change to handle_model_choice_change
-             model_choice.change(fn=handle_model_choice_change, inputs=model_choice, outputs=[retrieval_mode, choice, choice])
-
-             # gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
-
-             chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
-             tts_choice = gr.Radio(label="Select TTS System", choices=["Alpha", "Beta"], value="Alpha",interactive=False,visible=False)
 
-             retriever_button = gr.Button("Retriever")
-
-             clear_button = gr.Button("Clear")
-             clear_button.click(lambda: [None, None], outputs=[chat_input, state])
-
-             # gr.Markdown("<h1 style='color: red;'>Radar Map</h1>", elem_id="Map-Radar")
-             # location_output = gr.HTML()
-             # audio_output = gr.Audio(interactive=False, autoplay=True)
-
-             def stop_audio():
-                 audio_output.stop()
-                 return None
-
-             # retriever_sequence = (
-             #     retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
-             #     .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
-             #     # First, generate the bot response
-             #     .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
-             #     # Then, generate the TTS response based on the bot's response
-             #     .then(fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response")
-             #     .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
-             # )
-
-             retriever_sequence = (
-                 retriever_button.click(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
-                 # First, generate the bot response
-                 .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
-                 .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
-             )
-
-             # chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
-             #     fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
-             # ).then(
-             #     # First, generate the bot response
-             #     fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
-             # ).then(
-             #     # Then, generate the TTS response based on the bot's response
-             #     fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
-             # ).then(
-             #     fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox"
-             # )
-
-             chat_input.submit(
-                 fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
-             ).then(
-                 # First, generate the bot response
-                 fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
-             ).then(
-                 fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox"
-             )
-
-             # audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', every=0.1)
-             # audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="api_voice_to_text",visible=False)
-
-             # gr.Markdown("<h1 style='color: red;'>Example Prompts</h1>", elem_id="Example-Prompts")
-             # gr.Examples(examples=examples, fn=insert_prompt,inputs=chat_input, outputs=chat_input)
-
-     # with gr.Column():
-     #     weather_output = gr.HTML(value=fetch_local_weather())
-     #     news_output = gr.HTML(value=fetch_local_news())
-     #     events_output = gr.HTML(value=fetch_local_events())
-
-     # with gr.Column():
-     #     # Call update_images during the initial load to display images when the interface appears
-     #     initial_images = update_images()
-
-     #     # Displaying the images generated using Flux API directly
-     #     image_output_1 = gr.Image(value=initial_images[0], label="Image 1", elem_id="flux_image_1", width=400, height=400)
-     #     image_output_2 = gr.Image(value=initial_images[1], label="Image 2", elem_id="flux_image_2", width=400, height=400)
-     #     image_output_3 = gr.Image(value=initial_images[2], label="Image 3", elem_id="flux_image_3", width=400, height=400)
-
-     #     # Refresh button to update images
-     #     refresh_button = gr.Button("Refresh Images")
-     #     refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])
-
-         # File upload component
         with gr.Column():
-             file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-             # Button to trigger processing
-             process_button = gr.Button("Process PDF and Upload")
-
-             # Dataframe to display uploaded document records
-             document_table = gr.Dataframe(headers=["Document Name", "Upload Time", "Chunks", "Pinecone Index"], interactive=False)
-
-             # Output textbox for results
-             output_textbox = gr.Textbox(label="Result")
-
-             # Define button click action
-             # process_button.click(fn=process_pdf, inputs=file_input, outputs=output_textbox)
-             process_button.click(fn=process_pdf, inputs=[file_input, gr.State([])], outputs=[document_table, output_textbox])
-
- demo.queue()
- demo.launch(show_error=True)
-
  import gradio as gr
  import pdfplumber
  import os
+ from langchain.document_loaders import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.vectorstores import Pinecone
+ import pinecone
+ import pandas as pd
+ import time
+ from pinecone.grpc import PineconeGRPC as Pinecone
+ from pinecone import ServerlessSpec
+
+ # OpenAI API key
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+
+ # Initialize Pinecone with PineconeGRPC
+ pinecone_api_key = os.getenv("PINECONE_API_KEY")
+ pc = Pinecone(api_key=pinecone_api_key)
+
+ # Define index name and parameters
+ index_name = "italy-kg"
+
+ # Create index if it doesn't exist
+ if index_name not in pinecone.list_indexes():
+     pc.create_index(
+         name=index_name,
+         dimension=1536,
+         metric="cosine",
+         spec=ServerlessSpec(
+             cloud="aws",
+             region="us-east-1"
+         ),
+         deletion_protection="disabled"
      )
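The existence check above still goes through the legacy module-level pinecone.list_indexes(), while the index itself is created through the pc client instantiated earlier. A sketch of doing both through the same client object, assuming a v3+ pinecone client where list_indexes() returns a collection with a names() helper:

# Sketch: check and create the index through the same client (assumes pinecone-client v3+)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,              # must match the embedding width (text-embedding-ada-002 is 1536)
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )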
 
+ # Embedding using OpenAI
+ embeddings = OpenAIEmbeddings(api_key=openai_api_key)
 
+ # Gradio Blocks app with PDF uploader and table for logs
+ def process_pdf(file):
+     # Extract text from PDF using pdfplumber
+     with pdfplumber.open(file.name) as pdf:
+         text = ""
+         for page in pdf.pages:
+             text += page.extract_text()
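pdfplumber's page.extract_text() returns None for pages with no extractable text (scanned or image-only pages), which would make the += above raise a TypeError. A small defensive variant of the extraction loop, as a sketch:

    for page in pdf.pages:
        page_text = page.extract_text() or ""   # image-only pages yield None; fall back to ""
        text += page_text + "\n"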
+     # Split text using RecursiveCharacterTextSplitter
+     documents = [Document(page_content=text)]
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
+     docs = text_splitter.split_documents(documents)
+     # Add documents to Pinecone Vector Store
+     vectorstore = Pinecone.from_existing_index(index_name, embeddings)
+     vectorstore.add_documents(docs)
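Because the imports rebind Pinecone to the gRPC client, Pinecone.from_existing_index here would not reach the LangChain vector store as intended. With the LangchainPinecone alias from the import sketch above, the upsert plus a quick retrieval sanity check might look like this (illustrative only):

    # Upsert the chunks, then confirm they are retrievable
    vectorstore = LangchainPinecone.from_existing_index(index_name, embeddings)
    vectorstore.add_documents(docs)
    hits = vectorstore.similarity_search("What is this document about?", k=3)
    for hit in hits:
        print(hit.page_content[:80])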
+     # Prepare log data
+     log_data = {
+         "File Name": [file.name],
+         "File Size (KB)": [os.path.getsize(file.name) / 1024],
+         "Number of Chunks": [len(docs)],
+         "Timestamp": [time.strftime("%Y-%m-%d %H:%M:%S")]
      }
+     # Create a DataFrame for logs
+     df_logs = pd.DataFrame(log_data)
+
+     return "PDF processed successfully!", df_logs
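As written, every upload builds a fresh one-row DataFrame, so the Logs table only ever shows the most recent file. If a running history is wanted, one option is to accumulate records in a module-level list; upload_history and log_upload below are illustrative names, not part of the committed code.

upload_history = []   # persists across calls while the app process is running

def log_upload(file, docs):
    upload_history.append({
        "File Name": file.name,
        "File Size (KB)": os.path.getsize(file.name) / 1024,
        "Number of Chunks": len(docs),
        "Timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    })
    return pd.DataFrame(upload_history)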
+ # Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# PDF Uploader to Pinecone with Logs")
 
      with gr.Row():
          with gr.Column():
+             pdf_input = gr.File(label="Upload PDF", type="file")
+             process_button = gr.Button("Process PDF")
 
          with gr.Column():
+             output_text = gr.Textbox(label="Status", interactive=False)
+             log_table = gr.DataFrame(label="Logs", interactive=False)
 
+     # Define action on button click
+     process_button.click(process_pdf, inputs=pdf_input, outputs=[output_text, log_table])
+
+ # Launch the Gradio app
+ demo.launch()
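gr.File(type="file") is the Gradio 3.x form; newer Gradio releases expect type="filepath" (or "binary"), in which case the callback receives a plain path string rather than a tempfile wrapper. A sketch of the same wiring under that assumption, with process_pdf opening the path directly:

with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
            process_button = gr.Button("Process PDF")
        with gr.Column():
            output_text = gr.Textbox(label="Status", interactive=False)
            log_table = gr.DataFrame(label="Logs", interactive=False)
    # With type="filepath" the handler gets a str path, so pdfplumber.open(path) works directly
    process_button.click(process_pdf, inputs=pdf_input, outputs=[output_text, log_table])

demo.launch()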