from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import requests
from AWSClaude import AWSClaude
import json
import concurrent.futures
import time

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/get_n_depth_results")
async def get_n_depth_results(url, input_query):
    """Scrape a site to a depth of two, let the LLM select the most relevant links,
    and return the collected page (and PDF) content keyed by URL."""
    all_content = {}

    def add_pdf_content(selected_pdf):
        # Download each selected PDF and send it to the PDF-extraction service,
        # storing the parsed result in all_content keyed by the PDF URL.
        for pdf_url in selected_pdf:
            print(pdf_url)
            response = requests.get(pdf_url)

            # Save the downloaded bytes as a temporary PDF file
            pdf_path = "temp.pdf"
            with open(pdf_path, "wb") as file:
                file.write(response.content)
            print(f"PDF file saved as {pdf_path}")

            extraction_url = "http://localhost:5000/ask"
            # extraction_url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
            data = {"processTables": "True"}
            headers = {"Origin": "http://localhost:8080"}
            with open(pdf_path, "rb") as file:
                file_contents = file.read()
            files = {
                "pdf": (
                    pdf_path,
                    file_contents,
                    "application/pdf",
                )
            }
            response = requests.post(extraction_url, files=files, data=data, headers=headers)
            all_content[pdf_url] = response.json()

    def scrapper(input_url):
        # Call the headless-Chromium scraping service for one URL; store the page
        # content and return the list of links found on that page.
        params = {"url": input_url}
        headers = {"accept": "application/json"}
        scraper_url = "https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data"
        response = requests.get(scraper_url, headers=headers, params=params)
        all_url = response.json()["URL"]
        all_content[input_url] = response.json()["Content"]
        return all_url

    pdf_urls = []

    def separate_pdf_and_nonPDF_links(urls):
        # Collect PDF links separately; return the remaining links for re-scraping.
        pdf_links = [u for u in urls if u and u.endswith(".pdf")]
        if pdf_links:
            pdf_urls.append(pdf_links)
        return [u for u in urls if not (u and u.endswith(".pdf"))]

    def call_llm_service(scraped_data, input_url, input_query, pdf):
        # Ask the LLM to select the most relevant links from the scraped data.
        query = f"""
        Here are my scraped links: {scraped_data}
        Correct hostname: {input_url}. Use this hostname for all other tasks.
        I always need the full (www.hostname.com/...) {pdf} URLs for the most relevant links related to "{input_query}".
        Use the correct hostname from the provided content and return raw hyperlinks in JSON format only; do not add extra text or details.
        Only give JSON output: a plain list of links with no keys (always the full www.hostname.com/... form).
        """
        llm = "ClaudeHaiku"
        env = ""
        user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
        thread_id = "hKxvoVgi7vRJCHhvMzH5"
        stream_id = "stream1"
        app_type = "sentinel"
        other_request_params = {
            "messages": [
                {"role": "user", "content": query},
            ]
        }
        return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()

    input_url = f'["{url}"]'

    for step in range(1, 3):
        print(f"=================={step} step of scraping to get selected URLs from LLM=================================")
        next_urls = []
        # Scrape (at most) the first two candidate URLs in parallel.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(scrapper, link) for link in json.loads(input_url)[:2]]
            for future in concurrent.futures.as_completed(futures):
                next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
        # Let the LLM pick the next URLs to follow; its JSON answer feeds the next round.
        selected_links_from_llm = call_llm_service(next_urls, input_url, input_query, "")
        input_url = selected_links_from_llm
        print(json.loads(input_url)[:2])

    if not pdf_urls:
        print(pdf_urls)
        # return all_content.keys()
        return all_content
    else:
        # Ask the LLM which of the collected PDF links are relevant, then fetch and parse them.
        selected_pdf = json.loads(call_llm_service(pdf_urls, input_url, input_query, "only end with .pdf extension"))
        print(pdf_urls)
        print("selected pdf")
        print(selected_pdf)
        add_pdf_content(selected_pdf)
        # return all_content.keys()
        return all_content


# # Start time
# start_time = time.time()
# print(main("https://www.keells.com/", "Please analyse reports"))
# # End time
# end_time = time.time()
# # Calculate the time taken
# time_taken = end_time - start_time
# print(f"Time taken: {time_taken} seconds")
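# Example client call: a minimal sketch, assuming the app is served locally with
# `uvicorn main:app --port 8000` (the module name and port are assumptions, not part
# of this script). Because `url` and `input_query` are plain untyped parameters,
# FastAPI expects them as query parameters on the POST request.
#
# import requests
#
# resp = requests.post(
#     "http://localhost:8000/get_n_depth_results",
#     params={
#         "url": "https://www.keells.com/",
#         "input_query": "Please analyse reports",
#     },
# )
# print(resp.json())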