from typing import Any

import httpx

from mcp.server.fastmcp import FastMCP

mcp = FastMCP("arxiv-omar")

# Base URLs of the Hugging Face Spaces that back the tools below
CUSTOM_ARXIV_API_BASE = "https://om4r932-arxiv.hf.space"
DDG_API_BASE = "https://ychkhan-ptt-endpoints.hf.space"


async def make_request(url: str, data: dict[str, Any] | None = None) -> dict[str, Any] | None:
    """POST `data` as JSON to `url` and return the decoded JSON body, or None on any failure."""
    if data is None:
        return None
    headers = {"Accept": "application/json"}
    # verify=False skips TLS certificate verification for the upstream Spaces;
    # acceptable for this hobby server, not something to copy into
    # security-sensitive code.
    async with httpx.AsyncClient(verify=False) as client:
        try:
            response = await client.post(url, headers=headers, json=data)
            response.raise_for_status()
            return response.json()
        except Exception:
            # Network, HTTP and JSON-decoding errors all collapse to None;
            # callers turn that into a human-readable message.
            return None
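

# A usage sketch: the payload keys below are the ones the tools in this file
# actually send, and {"error": ..., "message": ...} is the response shape the
# arXiv handlers further down check for:
#
#   data = await make_request(f"{CUSTOM_ARXIV_API_BASE}/search",
#                             data={"keyword": "transformers", "limit": 5})
#   # -> {"error": False, "message": {...}} on success, None on failure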


def format_search(pub_id: str, content: dict) -> str:
    """Render one arXiv search hit as readable text."""
    return f"""
arXiv publication ID : {pub_id}
Title : {content["title"]}
Authors : {content["authors"]}
Release Date : {content["date"]}
Abstract : {content["abstract"]}
PDF link : {content["pdf"]}
"""


def format_extract(message: dict) -> str:
    """Render an extracted PDF payload as readable text."""
    return f"""
Title of PDF : {message.get("title", "No title has been found")}
Text : {message.get("text", "No text!")}
"""


def format_result_search(page: dict) -> str:
    """Render one DuckDuckGo search result as readable text."""
    return f"""
Title : {page.get("title", "No title found!")}
Short description : {page.get("body", "No description")}
PDF url : {page.get("url", None)}
"""


@mcp.tool()
async def get_publications(keyword: str, limit: int = 15) -> str:
    """
    Get arXiv publications based on keywords, up to a maximum number of documents

    Args:
        keyword: Keywords separated by spaces
        limit: Maximum number of publications returned (default: 15)
    """
    url = f"{CUSTOM_ARXIV_API_BASE}/search"
    data = await make_request(url, data={"keyword": keyword, "limit": limit})
    # The None check must come first: indexing a missing response would raise
    if not data:
        return "Unable to fetch publications"
    if data["error"]:
        return data["message"]
    if len(data["message"]) == 0:
        return "No publications found"

    publications = [format_search(pub_id, content) for pub_id, content in data["message"].items()]
    return "\n--\n".join(publications)


@mcp.tool()
async def web_pdf_search(query: str) -> str:
    """
    Search the Web (with the DuckDuckGo search engine) for PDF documents matching the keywords

    Args:
        query: Keywords to search documents on the Web
    """
    url = f"{DDG_API_BASE}/search"
    data = await make_request(url, data={"query": query})
    if not data:
        return "Unable to fetch results"
    if len(data["results"]) == 0:
        return "No results found"

    results = [format_result_search(result) for result in data["results"]]
    return "\n--\n".join(results)
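

# The tools chain naturally: a "PDF url" taken from a web_pdf_search result can
# be passed as pdf_url to get_pdf_text below. A sketch with a hypothetical URL:
#
#   results = await web_pdf_search("attention is all you need filetype:pdf")
#   text = await get_pdf_text(pdf_url="https://example.com/paper.pdf", limit_page=2)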


@mcp.tool()
async def get_pdf_text(pdf_url: str, limit_page: int = -1) -> str:
    """
    Extract the text from a URL pointing to a PDF file

    Args:
        pdf_url: URL to a PDF document
        limit_page: Number of pages to extract (default: -1 for all pages)
    """
    url = f"{CUSTOM_ARXIV_API_BASE}/extract_pdf/url"
    payload: dict[str, Any] = {"url": pdf_url}
    if limit_page != -1:
        payload["page_num"] = limit_page
    data = await make_request(url, data=payload)
    # The None check must come first: indexing a missing response would raise
    if not data:
        return "Unable to extract PDF text"
    if data["error"]:
        return data["message"]
    if len(data["message"]) == 0:
        return "No text can be extracted from this PDF"

    return format_extract(data["message"])


if __name__ == "__main__":
    # Serve over stdio so MCP clients can spawn this script as a subprocess
    mcp.run(transport="stdio")
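

# A minimal local smoke test, assuming the upstream Spaces are reachable and
# that FastMCP's tool() decorator returns the wrapped coroutine (so the tools
# are directly awaitable outside an MCP client):
#
#   import asyncio
#   print(asyncio.run(get_publications("quantum error correction", limit=3)))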