embeding_api / main.py
Arafath10's picture
Update main.py
91bee69 verified
raw
history blame
4.72 kB
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import requests
from AWSClaude import AWSClaude
import json
import concurrent.futures
import time
# Application instance; the route decorator further down registers on it.
app = FastAPI()

# CORS settings collected in one place: any origin, method and header is
# accepted, and credentialed requests are allowed.
_cors_settings = dict(
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **_cors_settings)
@app.post("/get_n_depth_results")
async def get_n_depth_results(url, input_query):
    """Crawl outward from ``url`` for two rounds and return the scraped content.

    Each round scrapes up to two seed URLs in parallel, sets aside the links
    that end in ``.pdf``, and asks the LLM which of the remaining links are
    most relevant to ``input_query``. The LLM reply becomes the seed list of
    the next round.

    Args:
        url: Page the first round starts from.
        input_query: Topic passed to the LLM when it picks relevant links.

    Returns:
        Dict mapping each scraped URL to the ``"Content"`` value returned by
        the scraper service for that URL.

    Raises:
        HTTPException: 502 when an LLM reply cannot be decoded as JSON, or
            when a link-selection reply is not a JSON list.
    """
    # Function-scope import: asyncio is not among the file's imports.
    import asyncio

    all_content = {}  # scraped URL -> content, filled in by the helpers
    pdf_urls = []  # one list of .pdf links per scraped page that had any

    def add_pdf_content(selected_pdf):
        """Download each PDF and store the local parser's JSON reply for it.

        NOTE(review): nothing in this endpoint calls this helper at present.
        Every download is staged in the same ``temp.pdf`` - presumably unsafe
        if two requests ever run it concurrently; confirm before enabling.
        """
        for pdf_url in selected_pdf:
            print(pdf_url)
            response = requests.get(pdf_url)
            # Save the content of the response as a PDF file
            pdf_path = "temp.pdf"
            with open(pdf_path, "wb") as file:
                file.write(response.content)
            print(f"PDF file saved as {pdf_path}")
            parser_url = "http://localhost:5000/ask"
            # parser_url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
            data = {"processTables": "True"}
            headers = {"Origin": "http://localhost:8080"}
            with open(pdf_path, "rb") as file:
                file_contents = file.read()
            files = {"pdf": (pdf_path, file_contents, "application/pdf")}
            response = requests.post(parser_url, files=files, data=data, headers=headers)
            all_content[pdf_url] = response.json()

    def scrapper(input_url):
        """Scrape one page, record its content and return the links found."""
        response = requests.get(
            'https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data',
            headers={'accept': 'application/json'},
            params={'url': input_url},
        )
        payload = response.json()  # decode the body once, not once per field
        all_url = payload["URL"]
        all_content[input_url] = payload["Content"]
        return all_url

    def separate_pdf_and_nonPDF_links(urls):
        """Stash the ``.pdf`` links in ``pdf_urls`` and return the other links."""
        pdf_links, other_links = [], []
        for link in urls:
            if link and link.endswith('.pdf'):
                pdf_links.append(link)
            else:
                # Falsy entries (None, "") land here too, as before.
                other_links.append(link)
        if pdf_links:
            pdf_urls.append(pdf_links)
        return other_links  # other links for rescraping

    def call_llm_service(scraped_data, input_url, input_query, pdf):
        """Ask the LLM for the links most relevant to ``input_query``.

        ``pdf`` is an extra phrase spliced into the prompt in front of
        "URLs"; the callers pass ``""`` or a ".pdf only" restriction.
        """
        # Assembled piecewise so the prompt text does not depend on how this
        # function happens to be indented.
        query = (
            "\n"
            "Here are my scraped links:\n"
            f"{scraped_data}\n"
            f"correct hostname: {input_url} use this host name for all other tasks\n"
            f"I need the always full (www.hotname.com/../) {pdf} URLs for the most "
            f'relevant links related to "{input_query}". use the correct hostname '
            "from this provided content, give raw hyperlink with json format only "
            "don't give extra text details. only give json output\n"
            "example json format is only links don't include keys "
            "(i need the always full (www.hotname.com/../))\n"
        )
        # NOTE(review): these identifiers are hard-coded; consider moving them
        # to configuration.
        llm = "ClaudeHaiku"
        env = ""
        user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
        thread_id = "hKxvoVgi7vRJCHhvMzH5"
        stream_id = "stream1"
        app_type = "sentinel"
        other_request_params = {"messages": [
            {"role": "user", "content": query},
        ]}
        return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()

    def parse_llm_json(raw_reply):
        """Decode an LLM reply, turning a decode failure into an HTTP 502."""
        try:
            return json.loads(raw_reply)
        except (TypeError, ValueError) as err:  # JSONDecodeError is a ValueError
            raise HTTPException(
                status_code=502,
                detail="LLM service did not return valid JSON",
            ) from err

    def crawl():
        """Run the blocking two-round crawl; meant to execute in a worker thread."""
        seeds = [url]
        # json.dumps escapes quotes and backslashes that the former
        # '["{url}"]' template would have turned into invalid JSON.
        llm_reply = json.dumps(seeds, ensure_ascii=False)

        for step in range(1, 3):
            print(f"=================={step} step of scraping to get selected URLs from LLM=================================")
            next_urls = []
            with concurrent.futures.ThreadPoolExecutor() as executor:
                # Only the first two seeds are followed in each round.
                futures = [executor.submit(scrapper, seed) for seed in seeds[:2]]
                for future in concurrent.futures.as_completed(futures):
                    next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
            # The previous reply (or the initial seed list) is passed as the
            # "correct hostname" hint in the prompt.
            llm_reply = call_llm_service(next_urls, llm_reply, input_query, "")
            seeds = parse_llm_json(llm_reply)
            if not isinstance(seeds, list):
                raise HTTPException(
                    status_code=502,
                    detail="LLM service did not return a JSON list of links",
                )
            print(seeds[:2])

        if pdf_urls:
            # NOTE(review): the selected PDFs are only logged; add_pdf_content
            # is not invoked, so no PDF content is added to the result.
            selected_pdf = parse_llm_json(
                call_llm_service(pdf_urls, llm_reply, input_query, "only end with .pdf extension")
            )
            print(pdf_urls)
            print("selected pdf")
            print(selected_pdf)
        else:
            print(pdf_urls)
        return all_content

    # requests and the LLM client are blocking; running them directly in this
    # coroutine would stall the event loop for every other request.
    return await asyncio.get_running_loop().run_in_executor(None, crawl)
# # Start time
# start_time = time.time()
# print(main("https://www.keells.com/", "Please analyse reports"))
# # End time
# end_time = time.time()
# # Calculate the time taken
# time_taken = end_time - start_time
# print(f"Time taken: {time_taken} seconds")