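# Spaces: Sleeping
# (The line above appears to be hosting-page status text captured with the file; it is not part of the program.)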
import sys
import traceback
from typing import Any, Literal

import httpx
from mcp.server.fastmcp import FastMCP
# Initialize FastMCP server
mcp = FastMCP("arxiv-omar")

# Constants: base URLs of the remote services; the tool functions append an
# endpoint path to one of these and POST to it through make_request().
CUSTOM_ARXIV_API_BASE = "https://om4r932-arxiv.hf.space"
DDG_API_BASE = "https://ychkhan-ptt-endpoints.hf.space"
API_3GPP_BASE = "https://organizedprogrammers-3gppdocfinder.hf.space"
# Helpers
async def make_request(url: str, data: dict = None) -> dict[str, Any] | None: | |
if data is None: | |
return None | |
headers = { | |
"Accept": "application/json" | |
} | |
async with httpx.AsyncClient(verify=False) as client: | |
try: | |
response = await client.post(url, headers=headers, json=data) | |
print(response) | |
response.raise_for_status() | |
return response.json() | |
except Exception as e: | |
traceback.print_exception(e) | |
return None | |
def format_search(pub_id: str, content: dict) -> str:
    """Render one arXiv search hit as a human-readable text block.

    Args:
        pub_id: arXiv identifier of the publication.
        content: Mapping read for the keys ``title``, ``authors``, ``date``,
            ``abstract`` and ``pdf``. A missing key is replaced by a
            placeholder text instead of raising ``KeyError``, matching the
            other formatters in this file.

    Returns:
        A multi-line string that starts and ends with a newline.
    """
    lines = (
        "",
        f"arXiv publication ID : {pub_id}",
        f"Title : {content.get('title', 'No title found')}",
        f"Authors : {content.get('authors', 'No authors found')}",
        f"Release Date : {content.get('date', 'No date found')}",
        f"Abstract : {content.get('abstract', 'No abstract found')}",
        f"PDF link : {content.get('pdf', 'No PDF link found')}",
        "",
    )
    return "\n".join(lines)
def format_extract(message: dict) -> str:
    """Render the result of a PDF text extraction as a text block.

    Args:
        message: Mapping read for the keys ``title`` and ``text``; each
            falls back to a placeholder sentence when absent.

    Returns:
        A multi-line string that starts and ends with a newline.
    """
    title = message.get("title", "No title has been found")
    text = message.get("text", "No text !")
    return f"\nTitle of PDF : {title}\nText : {text}\n"
def format_result_search(page: dict) -> str:
    """Render one web-search result as a text block.

    Args:
        page: Mapping read for the keys ``title``, ``body`` and ``url``.
            Missing title/body fall back to placeholder text; a missing
            ``url`` is rendered as ``None``.

    Returns:
        A multi-line string that starts and ends with a newline.
    """
    title = page.get("title", "No titles found !")
    description = page.get("body", "No description")
    link = page.get("url", None)
    return f"\nTitle : {title}\nLittle description : {description}\nPDF url : {link}\n"
def format_3gpp_doc_result(result: dict, release: int = None) -> str: | |
return f""" | |
Document ID : {result.get("doc_id")} | |
Release version : {release if release is not None else "Not specified"} | |
URL : {result.get("url", "No URL found !")} | |
""" | |
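# Tools
# NOTE(review): no @mcp.tool() decorators are visible on the functions below;
# confirm that they are actually registered with the FastMCP server.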
async def get_publications(keyword: str, limit: int = 15) -> str:
    """
    Get arXiv publications based on keywords and limit of documents

    Args:
        keyword: Keywords separated by spaces
        limit: Maximum number of publications returned (by default, 15)

    Returns:
        The formatted publications separated by ``--`` lines, or a short
        message when the request failed, the service reported an error, or
        nothing was found.
    """
    url = f"{CUSTOM_ARXIV_API_BASE}/search"
    data = await make_request(url, data={"keyword": keyword, "limit": limit})

    # make_request() returns None on failure, so this guard must come before
    # any key lookup on the response.
    if not data:
        return "Unable to fetch publications"
    if data.get("error"):
        return data.get("message", "Unable to fetch publications")

    found = data.get("message")
    if not found:
        return "No publications found"

    publications = [format_search(pub_id, content) for pub_id, content in found.items()]
    return "\n--\n".join(publications)
async def web_pdf_search(query: str) -> str:
    """
    Search on the Web (with DuckDuckGo search engine) to get PDF documents based on the keywords

    Args:
        query: Keywords to search documents on the Web

    Returns:
        The formatted results separated by ``--`` lines, or a short message
        when the request failed or returned no results.
    """
    url = f"{DDG_API_BASE}/search"
    data = await make_request(url, data={"query": query})
    if not data:
        return "Unable to fetch results"

    # A response without a "results" key is treated like an empty result
    # list instead of raising KeyError.
    hits = data.get("results")
    if not hits:
        return "No results found"

    return "\n--\n".join(format_result_search(hit) for hit in hits)
async def get_3gpp_doc_url_byID(doc_id: str, release: int | None = None) -> str:
    """
    Get 3GPP Technical Document URL by their document ID.

    Args:
        doc_id: Document ID (i.e. C4-125411, SP-551242, 31.101)
        release: The release version of the document (by default, None)

    Returns:
        The formatted lookup result, or a short message when the request
        failed.
    """
    url = f"{API_3GPP_BASE}/find"
    # "release" is always part of the payload, even when it is None.
    data = await make_request(url, data={"doc_id": doc_id, "release": release})
    if not data:
        return "Unable to search document in 3GPP"
    return format_3gpp_doc_result(data, release)
async def get_pdf_text(pdf_url: str, limit_page: int = -1) -> str:
    """
    Extract the text from the URL pointing to a PDF file

    Args:
        pdf_url: URL to a PDF document
        limit_page: How many pages the user wants to extract the content (default: -1 for all pages)

    Returns:
        The formatted title and text of the PDF, or a short message when the
        request failed, the service reported an error, or no text came back.
    """
    url = f"{CUSTOM_ARXIV_API_BASE}/extract_pdf/url"
    payload = {"url": pdf_url}
    # "page_num" is only sent when a page limit was requested.
    if limit_page != -1:
        payload["page_num"] = limit_page

    data = await make_request(url, data=payload)

    # make_request() returns None on failure, so this guard must come before
    # any key lookup on the response.
    if not data:
        return "Unable to extract PDF text"
    if data.get("error"):
        return data.get("message", "Unable to extract PDF text")

    extracted = data.get("message")
    if not extracted:
        return "No text can be extracted from this PDF"
    return format_extract(extracted)
if __name__ == "__main__":
    # Start the server on the stdio transport when the file is run as a script.
    mcp.run(transport="stdio")