Commit 7211b51
Parent(s): bbb7270

Enhance .gitignore and improve app.py functionality. Added additional file types to .gitignore for better exclusion. Updated app.py to manage upload progress with Streamlit session state, improved UI layout, and added documentation for user guidance.

- .gitignore +9 -1
- app.py +44 -63
- libgen_scraper.py +482 -0
- links.json +17 -0
- pages/1_Upload.py +34 -0
- pages/2_Chat.py +42 -0
.gitignore
CHANGED
@@ -1 +1,9 @@
-.env
+.env
+downloads/
+pdfs/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.pyw
+*.pyz
app.py
CHANGED
@@ -57,17 +57,33 @@ def process_upload(upload_type, file_or_link, file_name=None):
     chunks = [content[i:i+chunk_size] for i in range(0, content_length, chunk_size)]

     vectors = []
+    total_chunks = len(chunks)
+
+    # Use st.session_state to manage progress bar across function calls if needed on the page
+    if 'upload_progress' in st.session_state and hasattr(st.session_state.upload_progress, 'progress'):
+        progress_bar = st.session_state.upload_progress
+    else:
+        # If called outside the context of the upload page button press, handle appropriately
+        # For now, let's assume it's called from the Upload page context where progress is set
+        pass
+
+
     with ThreadPoolExecutor() as executor:
-        futures =
+        futures = {executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name): i for i, chunk in enumerate(chunks)}

+        processed_count = 0
         for future in as_completed(futures):
             vectors.append(future.result())
-
-            progress
-
+            processed_count += 1
+            # Update progress if progress_bar exists
+            if 'progress_bar' in locals() and progress_bar:
+                current_progress = processed_count / total_chunks
+                progress_bar.progress(current_progress)
+

     print(f"Generated {len(vectors)} vectors")

+    # Consider batching upserts for very large documents
     index.upsert(vectors=vectors)
     print("Vectors upserted to Pinecone")

@@ -122,7 +138,6 @@ def chat_with_ai(message):
             "doc_name": result['metadata']['doc_name'],
             "chunk_index": result['metadata']['chunk_index'],
             "text": result['metadata']['text'],
-            "type": result['metadata']['type']
         }
         for result in results
     ]
@@ -135,63 +150,29 @@ def clear_database():
     print("Database cleared")
     return "Database cleared successfully."

-# Streamlit
-st.set_page_config(
-    ...
+# Streamlit Main Page
+st.set_page_config(
+    page_title="RAG Chat Home",
+    page_icon="👋",
+)

-
-col1, col2, col3 = st.columns([1, 1, 1])
+st.title("Welcome to RAG Chat! 👋")

-    ... (old lines 145-162, the single-page upload UI, not captured in this view)
-            st.warning("No content uploaded. Please provide at least one input.")
-            st.session_state.upload_progress.empty()
-
-    if st.button("Clear Database"):
-        result = clear_database()
-        st.success(result)
-
-with col2:
-    st.header("Chat")
-    user_input = st.text_input("Ask a question about the uploaded content:")
-    if st.button("Send"):
-        if user_input:
-            print(f"Sending user input: {user_input}")
-            st.session_state.chat_progress = st.progress(0)
-            response, sources = chat_with_ai(user_input)
-            st.session_state.chat_progress.progress(1.0)
-            st.markdown("**You:** " + user_input)
-            st.markdown("**AI:** " + response)
-
-            # Store sources in session state for display in col3
-            st.session_state.sources = sources
-            st.session_state.chat_progress.empty()
-        else:
-            print("Empty user input")
-            st.warning("Please enter a question.")
-
-with col3:
-    st.header("Source Chunks")
-    if 'sources' in st.session_state and st.session_state.sources:
-        for i, source in enumerate(st.session_state.sources, 1):
-            with st.expander(f"Source {i} - {source['type']} ({source['doc_name']})"):
-                st.markdown(f"**Chunk Index:** {source['chunk_index']}")
-                st.text(source['text'])
-    else:
-        st.info("Ask a question to see source chunks here.")
+st.sidebar.success("Select a page above.")
+
+st.markdown(
+    """
+    This application allows you to upload PDF documents and chat with an AI
+    about their content.
+
+    **👈 Select a page from the sidebar** to get started:
+    - **Upload:** Add your PDF documents to the knowledge base.
+    - **Chat:** Ask questions about the documents you've uploaded.
+
+    The AI uses Retrieval-Augmented Generation (RAG) to find relevant sections
+    from your documents and provide informed answers.
+    """
+)
+
+# No UI elements here, just the core logic and initialization above.
+# The pages in the 'pages' directory will handle the UI.
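The key change in process_upload above is that the progress bar is no longer created inside the function: whichever page triggers the upload parks a st.progress widget in st.session_state, and the worker in app.py looks it up and updates it as chunks complete. A minimal, self-contained sketch of that hand-off (the do_work helper and the chunk count are illustrative only, not part of this commit):

import time
import streamlit as st

def do_work(total_chunks: int) -> None:
    # Look up the progress bar the calling page stored in session state (may be absent).
    progress_bar = st.session_state.get("upload_progress")
    for done in range(1, total_chunks + 1):
        time.sleep(0.1)  # stand-in for embedding and upserting one chunk
        if progress_bar is not None:
            progress_bar.progress(done / total_chunks)

if st.button("Process"):
    # The page owns the widget; the worker only updates it.
    st.session_state.upload_progress = st.progress(0)
    do_work(total_chunks=20)
    st.session_state.upload_progress.empty()
    del st.session_state.upload_progress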
libgen_scraper.py
ADDED
@@ -0,0 +1,482 @@
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import subprocess
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, unquote, parse_qs
from lxml import html  # Import lxml
from dotenv import load_dotenv
from pypdf import PdfReader  # Use pypdf instead of PyPDF2
from openai import OpenAI
from pinecone import Pinecone
# cssselect is used by lxml's .cssselect() method, ensure it's installed

# --- Initialization ---
load_dotenv()

# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "main"  # Your index name
try:
    index = pc.Index(index_name)
    print(f"Connected to Pinecone index: {index_name}")
    # Optional: Check index stats
    # print(index.describe_index_stats())
except Exception as e:
    print(f"Error connecting to Pinecone index '{index_name}': {e}")
    print("Please ensure the index exists and API keys are correct.")
    exit()

# URL provided by the user
url = "https://libgen.rs/search.php?&req=topicid147&phrase=1&view=simple&column=topic&sort=def&sortmode=ASC&page=1"

# Headers mimicking a browser request (Removed Host)
base_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9',
    'connection': 'keep-alive',
    'dnt': '1',
    'sec-ch-ua': '"Not:A-Brand";v="24", "Chromium";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',  # May need adjustment if navigating between domains
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
}

print(f"Attempting to fetch: {url}")

try:
    response = requests.get(url, headers=base_headers)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    print("Successfully fetched page content.")

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main table containing search results (often class='c' in libgen)
    # Inspecting the source, the table seems to be the 3rd table on the page,
    # or more reliably, the one with width="100%" and rules="rows"
    results_table = soup.find('table', {'rules': 'rows', 'width': '100%'})

    book_links = []
    base_url = "https://libgen.rs/"

    if results_table:
        print("Found results table. Processing rows...")
        rows = results_table.find_all('tr')
        print(f"Found {len(rows) - 1} potential book entries (excluding header).")

        # Skip the header row (index 0)
        for row in rows[1:]:
            cells = row.find_all('td')
            # Ensure the row has enough cells (at least 3 for the link)
            if len(cells) > 2:
                link_cell = cells[2]  # The third column usually contains the title link
                link_tag = link_cell.find('a')
                if link_tag and link_tag.has_attr('href'):
                    relative_link = link_tag['href']
                    # Ensure it's a book link (often starts with 'book/')
                    if relative_link.startswith('book/'):
                        full_link = base_url + relative_link
                        book_links.append(full_link)
            else:
                print("Skipping row with insufficient cells.")

        print(f"Extracted {len(book_links)} book links.")

        # Save the links to a JSON file
        output_filename = 'links.json'
        with open(output_filename, 'w') as f:
            json.dump(book_links, f, indent=4)
        print(f"Successfully saved links to {output_filename}")

    else:
        print("Could not find the results table. Check the HTML structure or selectors.")

except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

# Known download host patterns (check hostname ENDS WITH these)
DOWNLOAD_HOST_PATTERNS = ['.library.lol', '.books.ms', '.libstc.cc']

def get_embedding(text):
    try:
        response = client.embeddings.create(input=text, model="text-embedding-3-large")
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return None

def convert_djvu_to_pdf(djvu_filepath):
    """Converts a DJVU file to PDF using djvu2pdf command line tool."""
    pdf_filepath = os.path.splitext(djvu_filepath)[0] + ".pdf"
    command = ["djvu2pdf", djvu_filepath, pdf_filepath]
    print(f"Converting {os.path.basename(djvu_filepath)} to PDF...")
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        print(f"Successfully converted to {os.path.basename(pdf_filepath)}")
        return pdf_filepath
    except FileNotFoundError:
        print(f"Error: 'djvu2pdf' command not found. Please install djvulibre.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        print(f"Stderr: {e.stderr}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during conversion: {e}")
        return None

def process_and_upsert_pdf(pdf_filepath, original_filename):
    """Reads PDF, chunks text, gets embeddings, and upserts to Pinecone."""
    print(f"Processing PDF for Pinecone: {os.path.basename(pdf_filepath)}")
    doc_id = str(uuid.uuid4())
    try:
        reader = PdfReader(pdf_filepath)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # Add text only if extraction succeeded
                text += page_text + "\n"

        if not text:
            print(f"Warning: No text extracted from {os.path.basename(pdf_filepath)}")
            return f"Processed (No Text): {original_filename}"

        content_length = len(text)
        print(f"Extracted text length: {content_length}")

        # Simple chunking (adjust size as needed)
        chunk_size = 2000
        chunks = [text[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
        print(f"Split into {len(chunks)} chunks.")

        vectors = []
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            if embedding:
                vectors.append((
                    f"{doc_id}_{i}",
                    embedding,
                    {
                        "text": chunk,
                        "type": "PDF",
                        "doc_id": doc_id,
                        "doc_name": original_filename,  # Store the original filename
                        "chunk_index": i
                    }
                ))
            else:
                print(f"Skipping chunk {i} due to embedding error.")

        if not vectors:
            print("No vectors generated for upsert.")
            return f"Processed (No Vectors): {original_filename}"

        # Upsert in batches if necessary (Pinecone recommends batching)
        batch_size = 100
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i+batch_size]
            try:
                index.upsert(vectors=batch)
                print(f"Upserted batch {i//batch_size + 1} to Pinecone.")
            except Exception as e:
                print(f"Error upserting batch to Pinecone: {e}")
                # Decide if you want to stop or continue with other batches

        print(f"Successfully processed and upserted {original_filename} to Pinecone.")
        return f"Upserted: {original_filename}"

    except Exception as e:
        print(f"Error processing PDF {os.path.basename(pdf_filepath)}: {e}")
        return f"Error (Processing): {original_filename}"

def get_final_download_link(intermediate_page_url):
    """Visits an intermediate page (e.g., books.ms/main/HASH)
    and scrapes the final download link using the selector #download a.
    """
    print(f"Fetching final link from intermediate page: {intermediate_page_url}")
    try:
        # Update Host header for the specific request
        request_headers = base_headers.copy()
        parsed_url = urlparse(intermediate_page_url)
        if parsed_url.netloc:
            request_headers['Host'] = parsed_url.netloc

        response = requests.get(intermediate_page_url, headers=request_headers, timeout=20)
        response.raise_for_status()

        tree = html.fromstring(response.content)
        found_link = None

        # --- Attempt the simple, effective selector ---
        css_selector = "#download a"  # Target first anchor within #download
        print(f"Attempting CSS selector: {css_selector}")
        link_elements = tree.cssselect(css_selector)
        if link_elements:
            link_tag = link_elements[0]  # Take the first one found
            href = link_tag.get('href')
            if href:
                parsed_href = urlparse(href)
                # Validation:
                if (parsed_href.scheme and parsed_href.netloc and
                        '/main/' in parsed_href.path and
                        any(parsed_href.hostname.endswith(pattern) for pattern in DOWNLOAD_HOST_PATTERNS)):
                    print(f"Found final download link via CSS selector: {href}")
                    found_link = href
                else:
                    # If the first link doesn't validate, maybe log it but don't proceed
                    print(f"Selector '{css_selector}' found link, but failed validation: {href}")
            else:
                print(f"Selector '{css_selector}' found link tag, but it has no href.")
        else:
            print(f"CSS selector {css_selector} did not find any elements.")

        # --- Return result ---
        if found_link:
            return found_link
        else:
            # If no valid link was found after checking the first #download a
            print(f"Could not find valid download link using CSS selector '{css_selector}' on {intermediate_page_url}")
            return None

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching intermediate page {intermediate_page_url}")
        return None
    except requests.exceptions.RequestException as e:
        if e.response is not None:
            print(f"Error fetching intermediate page {intermediate_page_url}: Status {e.response.status_code}")
        else:
            print(f"Error fetching intermediate page {intermediate_page_url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing intermediate page {intermediate_page_url} with lxml: {e}")
        return None

def download_file_and_process(download_url, download_dir):
    """Downloads file, converts DJVU to PDF if needed, and triggers Pinecone upsert.
    Returns a status message.
    """
    if not download_url:
        return "Skipped: No download URL provided."

    processing_status = ""  # To store the outcome of PDF processing/upserting
    original_filename = "Unknown"
    final_filepath = None  # Path to the file to be processed (PDF)
    djvu_filepath_to_delete = None

    try:
        # --- Downloading ---
        parsed_url = urlparse(download_url)
        path_parts = [part for part in parsed_url.path.split('/') if part]
        filename_base = unquote(path_parts[-1]) if path_parts else f"download_{int(time.time())}"
        original_filename = filename_base  # Keep original name for metadata

        print(f"Attempting to download: {download_url}")
        response = requests.get(download_url, headers=base_headers, stream=True, timeout=120)  # Increased timeout
        response.raise_for_status()

        # --- Determine File Path and Extension ---
        content_disposition = response.headers.get('Content-Disposition')
        extension = '.pdf'  # Default
        if content_disposition:
            if 'filename=' in content_disposition:
                disp_filename = content_disposition.split('filename=')[-1].strip('"\'')
                if '.' in disp_filename:
                    extension = os.path.splitext(disp_filename)[1].lower()
        else:
            # Check extension from URL path if no content-disposition
            if '.' in filename_base:
                url_ext = os.path.splitext(filename_base)[1].lower()
                if url_ext in ['.pdf', '.djvu', '.epub', '.mobi']:  # Add other relevant types if needed
                    extension = url_ext

        filename = filename_base
        if not filename.lower().endswith(extension):
            filename += extension

        download_filepath = os.path.join(download_dir, filename)

        # --- Save File ---
        if os.path.exists(download_filepath):
            print(f"File already exists: {filename}")
            # Decide if we should still process it for Pinecone
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                # Check if corresponding PDF exists from previous run
                pdf_equiv = os.path.splitext(download_filepath)[0] + ".pdf"
                if os.path.exists(pdf_equiv):
                    print(f"Corresponding PDF already exists: {os.path.basename(pdf_equiv)}")
                    final_filepath = pdf_equiv
                else:
                    # Convert existing DJVU
                    print("DJVU exists but PDF doesn't. Converting...")
                    converted_pdf = convert_djvu_to_pdf(download_filepath)
                    if converted_pdf:
                        final_filepath = converted_pdf
            else:
                return f"Skipped (Exists, Non-PDF/DJVU): {filename}"
        else:
            print(f"Downloading to: {download_filepath}")
            with open(download_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Successfully downloaded: {filename}")

            # --- Post-Download Processing ---
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                converted_pdf = convert_djvu_to_pdf(download_filepath)
                if converted_pdf:
                    final_filepath = converted_pdf
                    djvu_filepath_to_delete = download_filepath  # Mark original for deletion
            else:
                print(f"Downloaded non-PDF/DJVU file: {filename}. Skipping Pinecone process.")
                return f"Success (DL Only): {filename}"

        # --- Pinecone Upsert Trigger ---
        if final_filepath and os.path.exists(final_filepath):
            processing_status = process_and_upsert_pdf(final_filepath, original_filename)

            # Optional: Delete original DJVU after successful conversion and processing
            if djvu_filepath_to_delete and 'Error' not in processing_status:
                try:
                    os.remove(djvu_filepath_to_delete)
                    print(f"Deleted original DJVU: {os.path.basename(djvu_filepath_to_delete)}")
                except Exception as e:
                    print(f"Error deleting DJVU file {os.path.basename(djvu_filepath_to_delete)}: {e}")

        else:
            processing_status = "Skipped Upsert (No PDF)"

        return f"Download OK. Status: {processing_status}"

    except requests.exceptions.Timeout:
        print(f"Timeout error downloading {download_url}")
        return f"Error (Timeout): {original_filename}"
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {download_url}: {e}")
        return f"Error (RequestException): {original_filename}"
    except Exception as e:
        print(f"An unexpected error occurred during download/process of {original_filename}: {e}")
        return f"Error (Unexpected): {original_filename}"

# --- Main Execution ---

input_filename = 'links.json'
download_dir = 'downloads'
max_workers = 3  # Reduce workers slightly due to processing load

# Create download directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print(f"Created directory: {download_dir}")

# --- Read original libgen.rs book page URLs ---
try:
    with open(input_filename, 'r') as f:
        # Load all URLs as originally intended
        libgen_book_page_urls = json.load(f)
    print(f"Loaded {len(libgen_book_page_urls)} libgen.rs book page URLs from {input_filename}")

except FileNotFoundError:
    print(f"Error: {input_filename} not found. Please run the initial link scraping part first.")
    exit()
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {input_filename}.")
    exit()

if not libgen_book_page_urls:
    print("No book page URLs found in the file. Exiting.")
    exit()

# --- Stage 1: Construct intermediate URLs and get final download links ---
final_download_links = []
intermediate_urls_to_try = []

print("\n--- Constructing Intermediate URLs ---")
# Process all URLs again
for url in libgen_book_page_urls:
    try:
        parsed_libgen_url = urlparse(url)
        query_params = parse_qs(parsed_libgen_url.query)
        md5_list = query_params.get('md5')
        if md5_list:
            md5 = md5_list[0]
            intermediate_url = f"http://books.ms/main/{md5}"
            intermediate_urls_to_try.append(intermediate_url)
            # Maybe remove verbose printing for full run
            # print(f"Constructed: {intermediate_url} from {url}")
        else:
            print(f"Could not extract MD5 from {url}")
    except Exception as e:
        print(f"Error processing libgen URL {url}: {e}")

print(f"\n--- Fetching Final Download Links from {len(intermediate_urls_to_try)} intermediate URLs ---")
if intermediate_urls_to_try:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Use the renamed function get_final_download_link
        future_to_url = {executor.submit(get_final_download_link, intermediate_url): intermediate_url for intermediate_url in intermediate_urls_to_try}
        for future in as_completed(future_to_url):
            intermediate_url = future_to_url[future]
            try:
                result = future.result()
                if result:
                    final_download_links.append(result)
            except Exception as exc:
                print(f'Fetching final download link for {intermediate_url} generated an exception: {exc}')

print(f"\nFound {len(final_download_links)} final download links.")

# --- Stage 2: Download, Convert, and Process files concurrently ---
print("\n--- Downloading, Converting, Processing Files ---")
download_process_results = []
if final_download_links:
    # Use the new function that handles download, conversion, and upsert trigger
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_download = {executor.submit(download_file_and_process, link, download_dir): link for link in final_download_links}
        for future in as_completed(future_to_download):
            link = future_to_download[future]
            try:
                result_message = future.result()
                download_process_results.append(result_message)
                print(f"Overall Result for {link}: {result_message}")
            except Exception as exc:
                print(f'Download/Processing {link} generated an exception: {exc}')
                download_process_results.append(f"Error (Exception): {link}")
else:
    print("No final download links found, skipping download/process stage.")

# --- Final Summary ---
print("\n--- Final Summary ---")
# Analyze the results strings for a more detailed summary (optional)
success_upsert_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Upserted'))
success_dl_only_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Success (DL Only)'))
success_no_text_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Processed (No Text)'))
skipped_dl_count = sum(1 for r in download_process_results if r.startswith('Skipped'))
error_count = len(download_process_results) - success_upsert_count - success_dl_only_count - success_no_text_count - skipped_dl_count

print(f"Total final links attempted: {len(final_download_links)}")
print(f"Successfully Downloaded & Upserted to Pinecone: {success_upsert_count}")
print(f"Successfully Downloaded (Non-PDF/DJVU or Skipped Upsert): {success_dl_only_count + success_no_text_count}")
print(f"Skipped Download (e.g., already exists): {skipped_dl_count}")
print(f"Errors (Download/Conversion/Process/Upsert): {error_count}")

print(f"\nDownloads attempted in the '{download_dir}' directory.")

# --- End Main Execution ---
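libgen_scraper.py makes a few assumptions about its environment that are easy to miss when running it on its own: OPENAI_API_KEY and PINECONE_API_KEY must be available via .env, the Pinecone index named "main" must already exist, cssselect must be installed for lxml's .cssselect(), and the djvu2pdf tool from djvulibre must be on the PATH for DJVU conversion. A small preflight check along these lines (not part of the commit; names are illustrative) can fail fast before the scraping stages start:

import os
import shutil
from dotenv import load_dotenv

def preflight() -> list[str]:
    """Return a list of problems that would make libgen_scraper.py fail part-way."""
    load_dotenv()
    problems = []
    for key in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
        if not os.getenv(key):
            problems.append(f"missing environment variable: {key}")
    if shutil.which("djvu2pdf") is None:
        problems.append("djvu2pdf not found on PATH, so DJVU downloads cannot be converted")
    try:
        import cssselect  # noqa: F401  (required by lxml's .cssselect())
    except ImportError:
        problems.append("cssselect is not installed")
    return problems

if __name__ == "__main__":
    for problem in preflight():
        print(f"WARNING: {problem}")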
links.json
ADDED
@@ -0,0 +1,17 @@
[
    "https://libgen.rs/book/index.php?md5=372E34D136FBF39DCE00460D9E8F1F52",
    "https://libgen.rs/book/index.php?md5=EB252D785B9D104EC533CF5326D89DEF",
    "https://libgen.rs/book/index.php?md5=824483B0D731CBB3221C722455E3CAC8",
    "https://libgen.rs/book/index.php?md5=CB207C615844574D6384B210090C46D0",
    "https://libgen.rs/book/index.php?md5=270015C07518D5C2293CD613D3B75F9D",
    "https://libgen.rs/book/index.php?md5=560ECD8FBD2BA7D757B5A51DA042B50F",
    "https://libgen.rs/book/index.php?md5=3E9400AA7C4C99881ED7EDA013A27C0E",
    "https://libgen.rs/book/index.php?md5=8558FC34C2C407926FF051201EFEDD50",
    "https://libgen.rs/book/index.php?md5=1D4206FF2F3B3EB3D7E15A95D31E5B18",
    "https://libgen.rs/book/index.php?md5=C7202A81EFF198F776DB05CB8EFBA0BD",
    "https://libgen.rs/book/index.php?md5=4AEE828F0332D2FBC34210AA79602FB3",
    "https://libgen.rs/book/index.php?md5=3EC35852AD8E56F72EF977EF636070DC",
    "https://libgen.rs/book/index.php?md5=C5A01443018DD3AFA28CBF8CF0AF8CE8",
    "https://libgen.rs/book/index.php?md5=0D5CB47FF53DD63D764ACEE476B3C2AB",
    "https://libgen.rs/book/index.php?md5=E4D3667D48E2DF2FE6491EE88FCFBB79"
]
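Each entry in links.json is a libgen.rs book page URL whose md5 query parameter is the only part the scraper actually uses: Stage 1 of libgen_scraper.py pulls it out with parse_qs and rewrites it into an intermediate books.ms/main/<md5> URL. A standalone illustration of that transformation, using the first entry above:

from urllib.parse import urlparse, parse_qs

book_page = "https://libgen.rs/book/index.php?md5=372E34D136FBF39DCE00460D9E8F1F52"

# parse_qs returns {'md5': ['372E...']}; the scraper takes the first value.
md5 = parse_qs(urlparse(book_page).query)["md5"][0]
intermediate_url = f"http://books.ms/main/{md5}"
print(intermediate_url)  # http://books.ms/main/372E34D136FBF39DCE00460D9E8F1F52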
pages/1_Upload.py
ADDED
@@ -0,0 +1,34 @@
import streamlit as st
from app import process_upload, clear_database

st.title("Upload PDFs")

st.header("Upload")

# PDF upload
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if st.button("Process All"):
    if 'upload_progress' not in st.session_state:
        st.session_state.upload_progress = st.progress(0)

    with st.spinner("Processing uploads..."):
        results = []
        if uploaded_file:
            pdf_result = process_upload("PDF", uploaded_file, uploaded_file.name)
            results.append(pdf_result)

        if results:
            for result in results:
                st.success(result)
        else:
            st.warning("No content uploaded. Please provide at least one input.")

    # Check if the progress bar exists before trying to empty it
    if hasattr(st.session_state, 'upload_progress'):
        st.session_state.upload_progress.empty()
        del st.session_state.upload_progress  # Clean up state

if st.button("Clear Database"):
    result = clear_database()
    st.success(result)
pages/2_Chat.py
ADDED
@@ -0,0 +1,42 @@
import streamlit as st
from app import chat_with_ai

st.title("Chat with PDFs")

# Initialize session state for sources if it doesn't exist
if 'sources' not in st.session_state:
    st.session_state.sources = None

# Create two columns for chat and sources
col1, col2 = st.columns([2, 1])

with col1:
    st.header("Chat")
    user_input = st.text_input("Ask a question about the uploaded content:")
    if st.button("Send"):
        if user_input:
            print(f"Sending user input: {user_input}")
            st.session_state.chat_progress = st.progress(0)
            response, sources = chat_with_ai(user_input)
            st.session_state.chat_progress.progress(1.0)
            st.markdown("**You:** " + user_input)
            st.markdown("**AI:** " + response)

            # Store sources in session state for display in col2
            st.session_state.sources = sources
            st.session_state.chat_progress.empty()
            # Clean up state
            del st.session_state.chat_progress
        else:
            print("Empty user input")
            st.warning("Please enter a question.")

with col2:
    st.header("Source Chunks")
    if st.session_state.sources:
        for i, source in enumerate(st.session_state.sources, 1):
            with st.expander(f"Source {i} - {source['doc_name']}"):
                st.markdown(f"**Chunk Index:** {source['chunk_index']}")
                st.text(source['text'])
    else:
        st.info("Ask a question to see source chunks here.")
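Taken together, the commit moves the UI to Streamlit's multipage layout: app.py keeps the shared logic (process_upload, chat_with_ai, clear_database) and the home page, while pages/1_Upload.py and pages/2_Chat.py import those functions and provide the upload and chat screens. Started the usual way for Streamlit multipage apps, with streamlit run app.py, the scripts in the pages/ directory appear automatically in the sidebar that the home page points users to.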