johannoriel committed
Commit 83a9d0d
1 parent: b0196ae

Update app.py


Add an upload cache and allow working without a file
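The new cache keys each uploaded PDF by the MD5 hash of its bytes, so re-uploading the same document reuses the stored copy, and a new dropdown lets a query run against a previously uploaded file. A minimal sketch of the naming scheme (the byte string is an illustrative stand-in, not a real PDF):

```python
import hashlib
import os

CACHE_DIR = "pdf_cache"                   # same directory the app uses
pdf_bytes = b"%PDF-1.4 example contents"  # stand-in for an uploaded file's bytes
cached_path = os.path.join(CACHE_DIR, f"{hashlib.md5(pdf_bytes).hexdigest()}.pdf")
# identical bytes -> identical hash -> identical path, so duplicates are stored once
print(cached_path)
```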

Files changed (1)
  1. app.py +33 -11
app.py CHANGED
@@ -5,12 +5,16 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
 import fitz # PyMuPDF
+import os
+import hashlib
+
+# Directory to store cached files
+CACHE_DIR = "pdf_cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
 
-# Function to get available models from Hugging Face
 def get_hf_models():
     return ["Qwen/Qwen2.5-3B-Instruct", "HuggingFaceH4/zephyr-7b-beta", "mistralai/Mistral-7B-Instruct-v0.1"]
 
-# Function to extract text from a PDF
 def extract_text_from_pdf(pdf_path):
     text = ""
     with fitz.open(pdf_path) as doc:
@@ -18,13 +22,11 @@ def extract_text_from_pdf(pdf_path):
         text += page.get_text()
     return text
 
-# Function for manual RAG
 def manual_rag(query, context, client):
     prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
     response = client.text_generation(prompt, max_new_tokens=512)
     return response
 
-# Function for classic RAG
 def classic_rag(query, pdf_path, client, embedder):
     text = extract_text_from_pdf(pdf_path)
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
@@ -36,26 +38,47 @@ def classic_rag(query, pdf_path, client, embedder):
     response = manual_rag(query, context, client)
     return response, context
 
-# Function for response without RAG
 def no_rag(query, client):
     response = client.text_generation(query, max_new_tokens=512)
     return response
 
-# Gradio interface function
-def process_query(query, pdf_path, llm_choice, embedder_choice):
+def cache_file(file):
+    if file is None:
+        return None
+    file_hash = hashlib.md5(file.read()).hexdigest()
+    cached_path = os.path.join(CACHE_DIR, f"{file_hash}.pdf")
+    if not os.path.exists(cached_path):
+        with open(cached_path, "wb") as f:
+            file.seek(0)
+            f.write(file.read())
+    return cached_path
+
+def get_cached_files():
+    return [f for f in os.listdir(CACHE_DIR) if f.endswith('.pdf')]
+
+def process_query(query, pdf_file, cached_file, llm_choice, embedder_choice):
     client = InferenceClient(llm_choice)
-    full_text = extract_text_from_pdf(pdf_path)
     no_rag_response = no_rag(query, client)
+
+    if pdf_file is not None:
+        pdf_path = cache_file(pdf_file)
+    elif cached_file:
+        pdf_path = os.path.join(CACHE_DIR, cached_file)
+    else:
+        return no_rag_response, "RAG non utilisé (pas de fichier PDF)", "RAG non utilisé (pas de fichier PDF)", "Pas de fichier PDF fourni", "Pas de contexte extrait"
+
+    full_text = extract_text_from_pdf(pdf_path)
     manual_rag_response = manual_rag(query, full_text, client)
     classic_rag_response, classic_rag_context = classic_rag(query, pdf_path, client, embedder_choice)
+
     return no_rag_response, manual_rag_response, classic_rag_response, full_text, classic_rag_context
 
-# Create Gradio interface
 iface = gr.Interface(
     fn=process_query,
     inputs=[
         gr.Textbox(label="Votre question"),
-        gr.File(label="Chargez votre PDF"),
+        gr.File(label="Chargez un nouveau PDF"),
+        gr.Dropdown(choices=get_cached_files, label="Ou choisissez un PDF déjà téléversé", interactive=True),
         gr.Dropdown(choices=get_hf_models(), label="Choisissez le LLM", value="Qwen/Qwen2.5-3B-Instruct"),
         gr.Dropdown(choices=["sentence-transformers/all-MiniLM-L6-v2", "nomic-ai/nomic-embed-text-v1.5"],
                     label="Choisissez l'Embedder", value="sentence-transformers/all-MiniLM-L6-v2")
@@ -72,6 +95,5 @@ iface = gr.Interface(
     theme="default"
 )
 
-# Launch the application
 if __name__ == "__main__":
     iface.launch()
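For a quick check of the new caching path outside the Gradio UI, a sketch assuming `cache_file` receives a binary file-like object (whether it does depends on the Gradio version and the `gr.File` `type` setting) and that `app.py` is importable as a module:

```python
import io
from app import cache_file, get_cached_files  # assumes app.py is on the import path

pdf_bytes = b"%PDF-1.4 minimal"             # stand-in content, not a valid PDF
first = cache_file(io.BytesIO(pdf_bytes))
second = cache_file(io.BytesIO(pdf_bytes))  # same bytes hash to the same cached path
assert first == second
print(get_cached_files())                   # the cached copy now appears among the dropdown choices
```

Note that `choices=get_cached_files` passes the function itself rather than its result; whether Gradio re-evaluates it on each page load depends on the version in use.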