Update main.py
main.py
CHANGED
@@ -1,13 +1,20 @@
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-
-
-import
-import
-
+
+
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import numpy as np
+
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained("Arafath10/reference_page_finder")
+
+# Load the model
+model = AutoModelForSequenceClassification.from_pretrained("Arafath10/reference_page_finder")
+

 app = FastAPI()
+
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -16,127 +23,40 @@ app.add_middleware(
     allow_headers=["*"],
 )

-
-
-
-async def get_n_depth_results(url,input_query):
-    # Start time
-    start_time = time.time()
-    all_content = {}
-
-    def add_pdf_content(selected_pdf):
-        for pdf_url in selected_pdf:
-            print(pdf_url)
-            response = requests.get(pdf_url)
-
-            # Save the content of the response as a PDF file
-            pdf_path = "temp.pdf"
-            with open(pdf_path, "wb") as file:
-                file.write(response.content)
-
-            print(f"PDF file saved as {pdf_path}")
-
-            url = "http://localhost:5000/ask"
-            # url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
-
-            data = {"processTables": "True"}
-
-            headers = {"Origin": "http://localhost:8080"}
-
-            with open(pdf_path, "rb") as file:
-                file_contents = file.read()
-
-            files = {
-                "pdf": (
-                    pdf_path,
-                    file_contents,
-                    "application/pdf",
-                )
-            }
-
-            response = requests.post(url, files=files, data=data, headers=headers)
-            all_content[pdf_url] = response.json()
-
-    def scrapper(input_url):
-        params = {'url': input_url}
-        headers = {'accept': 'application/json'}
-        url = 'https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data'
-        try:
-            response = requests.get(url, headers=headers, params=params)
-            all_url = response.json()["URL"]
-            all_content[input_url] = response.json()["Content"]
-            return all_url
-        except:
-            print(f"found a error url : {input_url}=========================================")
-            return "none"
-
-    pdf_urls = []
-
-    def separate_pdf_and_nonPDF_links(urls):
-        # Separate URLs into two lists
-        pdf_links = [url for url in urls if url and url.endswith('.pdf')]
-        if pdf_links:
-            pdf_urls.append(pdf_links)
-        return [url for url in urls if not (url and url.endswith('.pdf'))]  # other links for rescraping
-
-    def call_llm_service(scraped_data, input_url, input_query, pdf):
-        query = f"""
-        Here are my scraped links:
-
-        {scraped_data}
-
-        correct hostname: {input_url} use this host name for all other tasks
-
-
-        """
-        llm = "ClaudeHaiku"
-        env = ""
-        user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
-        thread_id = "hKxvoVgi7vRJCHhvMzH5"
-        stream_id = "stream1"
-        app_type = "sentinel"
-        other_request_params = {"messages": [
-            {"role": "user", "content": query},
-        ]}
-        return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()
-
-    input_url = f'["{url}"]'
-    input_query = input_query
-
-
-
-    next_urls = []
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(scrapper, input_url) for input_url in (json.loads(input_url)[:2])]
-        for future in concurrent.futures.as_completed(futures):
-            next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
-    if step==2:
-        break
-    selected_links_from_llm = call_llm_service(next_urls, input_url, input_query, "")
-    input_url = selected_links_from_llm
-    print(json.loads(input_url)[:2])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #return all_content.keys()
-    return all_content
-
-
+@app.post("/find_refrence_page")
+async def find_refrence_page(request: Request):
+    try:
+        # Extract the JSON body
+        body = await request.json()
+        test_text = body.get("text")
+
+        if not test_text:
+            raise HTTPException(status_code=400, detail="Missing 'text' field in request body")
+
+        import re
+
+        # Remove all types of extra whitespace (spaces, tabs, newlines)
+        test_text = re.sub(r'\s+', ' ', test_text).strip()
+
+        def chunk_string(input_string, chunk_size):
+            return [input_string[i:i + chunk_size] for i in range(0, len(input_string), chunk_size)]
+
+        chunks = chunk_string(test_text, chunk_size=512)
+        chunks = reversed(chunks)
+        # Output the chunks
+        flag = "no reference found"
+        for idx, chunk in enumerate(chunks):
+            print(f"Chunk {idx + 1} {chunk}")
+            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding="max_length")
+            outputs = model(**inputs)
+            predictions = np.argmax(outputs.logits.detach().numpy(), axis=-1)
+            #print("Prediction:", "yes reference found" if predictions[0] == 1 else "no reference found")
+            if predictions[0] == 1:
+                flag = "yes reference found"
+                break
+        return flag
+    except:
+        return "error"

 #print(main("https://www.keells.com/", "Please analyse reports"))
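For reference, here is one way the new route could be exercised once the app is running. This is a minimal sketch, not part of the commit: the base URL assumes a local `uvicorn main:app` run on a placeholder port, and only the `/find_refrence_page` route, the `text` field, and the returned strings come from the code above.

import requests

# Assumed local deployment; replace the base URL with the actual Space URL if deployed there.
resp = requests.post(
    "http://localhost:7860/find_refrence_page",
    json={"text": "... full document text, ideally including a References section ..."},
)

# The handler returns a plain string: "yes reference found", "no reference found",
# or "error" if the request body could not be processed.
print(resp.json())

Because the handler scans the 512-character chunks in reverse order and stops at the first positive prediction, a reference section near the end of the text is typically detected within the first few model calls.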