from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import requests
from AWSClaude import AWSClaude
import json
import concurrent.futures
import time

app = FastAPI()

# Allow cross-origin requests from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def get_n_depth_results(url, input_query):
    # Scrape the given site step by step, using an LLM to select which links
    # (and PDFs) are most relevant to input_query. Returns a dict mapping each
    # visited URL to its scraped content.
    all_content = {}

    def add_pdf_content(selected_pdf):
        # Download each selected PDF and send it to the PDF-extraction service,
        # storing the parsed result in all_content keyed by the PDF URL.
        for pdf_url in selected_pdf:
            print(pdf_url)
            response = requests.get(pdf_url)
            # Save the content of the response as a PDF file
            pdf_path = "temp.pdf"
            with open(pdf_path, "wb") as file:
                file.write(response.content)
            print(f"PDF file saved as {pdf_path}")
            url = "http://localhost:5000/ask"
            # url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
            data = {"processTables": "True"}
            headers = {"Origin": "http://localhost:8080"}
            with open(pdf_path, "rb") as file:
                file_contents = file.read()
            files = {
                "pdf": (
                    pdf_path,
                    file_contents,
                    "application/pdf",
                )
            }
            response = requests.post(url, files=files, data=data, headers=headers)
            all_content[pdf_url] = response.json()
    def scrapper(input_url):
        # Scrape a single page via the hosted scraper service, record its
        # content, and return the list of URLs discovered on that page.
        params = {"url": input_url}
        headers = {"accept": "application/json"}
        url = "https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data"
        try:
            response = requests.get(url, headers=headers, params=params)
            result = response.json()
            all_content[input_url] = result["Content"]
            return result["URL"]
        except Exception:
            print(f"found an error url: {input_url}=========================================")
            return []
    pdf_urls = []

    def separate_pdf_and_nonPDF_links(urls):
        # Separate URLs into two lists: collect PDF links for later processing
        # and return the remaining links for rescraping.
        pdf_links = [url for url in urls if url and url.endswith(".pdf")]
        if pdf_links:
            pdf_urls.append(pdf_links)
        return [url for url in urls if not (url and url.endswith(".pdf"))]
    def call_llm_service(scraped_data, input_url, input_query, pdf):
        # Ask the LLM to pick the most relevant links from the scraped data and
        # return them as a JSON list of full URLs.
        query = f"""
        Here are my scraped links:
        {scraped_data}
        Correct hostname: {input_url}. Use this hostname for all other tasks.
        I always need the full (www.hostname.com/...) {pdf} URLs for the most relevant links related to "{input_query}". Use the correct hostname from the provided content. Give the raw hyperlinks in JSON format only; don't give extra text or details. Only give JSON output.
        The example JSON format is a list of links only; don't include keys (I always need the full www.hostname.com/... form).
        """
        llm = "ClaudeHaiku"
        env = ""
        user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
        thread_id = "hKxvoVgi7vRJCHhvMzH5"
        stream_id = "stream1"
        app_type = "sentinel"
        other_request_params = {
            "messages": [
                {"role": "user", "content": query},
            ]
        }
        return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()
    # Start from the root URL, expressed as a JSON-encoded list so it matches
    # the format the LLM returns on later iterations.
    input_url = f'["{url}"]'

    for step in range(1, 3):
        print(f"=================={step} step of scraping to get selected URLs from LLM=================================")
        next_urls = []
        # Scrape at most the first two candidate URLs concurrently.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(scrapper, input_url) for input_url in json.loads(input_url)[:2]]
            for future in concurrent.futures.as_completed(futures):
                next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
        # Let the LLM choose which of the discovered links to follow next.
        selected_links_from_llm = call_llm_service(next_urls, input_url, input_query, "")
        input_url = selected_links_from_llm
        print(json.loads(input_url)[:2])
    if not pdf_urls:
        print(pdf_urls)
        # return all_content.keys()
        return all_content
    else:
        # Ask the LLM to choose the most relevant PDFs, then pull their parsed
        # content into all_content before returning.
        selected_pdf = json.loads(call_llm_service(pdf_urls, input_url, input_query, "only end with .pdf extension"))
        print(pdf_urls)
        print("selected pdf")
        print(selected_pdf)
        add_pdf_content(selected_pdf)
        # return all_content.keys()
        return all_content
# # Start time
# start_time = time.time()
# # get_n_depth_results is async, so run it with asyncio (requires "import asyncio")
# print(asyncio.run(get_n_depth_results("https://www.keells.com/", "Please analyse reports")))
# # End time
# end_time = time.time()
# # Calculate the time taken
# time_taken = end_time - start_time
# print(f"Time taken: {time_taken} seconds")