File size: 4,588 Bytes
8227e25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from typing import Any, Literal
import httpx
import traceback
from mcp.server.fastmcp import FastMCP

# Initialize FastMCP server
# The instance registers every function decorated with @mcp.tool() below.
mcp = FastMCP("arxiv-omar")

# Constants
# Base URLs of the remote HTTP services the tools call through make_request.
# arXiv service: "/search" (get_publications) and "/extract_pdf/url" (get_pdf_text).
CUSTOM_ARXIV_API_BASE = "https://om4r932-arxiv.hf.space"
# Web search service: "/search" (web_pdf_search).
DDG_API_BASE = "https://ychkhan-ptt-endpoints.hf.space"
# 3GPP document finder service: "/find" (get_3gpp_doc_url_byID).
API_3GPP_BASE = "https://organizedprogrammers-3gppdocfinder.hf.space"

# Helpers
async def make_request(url: str, data: dict = None) -> dict[str, Any] | None:
    if data is None:
        return None
    headers = {
        "Accept": "application/json"
    }
    async with httpx.AsyncClient(verify=False) as client:
        try:
            response = await client.post(url, headers=headers, json=data)
            print(response)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            traceback.print_exception(e)
            return None
        
def format_search(pub_id: str, content: dict) -> str:
    """Render one arXiv search hit as a human-readable text block.

    Args:
        pub_id: arXiv publication identifier shown on the first line.
        content: Publication metadata. The keys ``title``, ``authors``,
            ``date``, ``abstract`` and ``pdf`` are read; a missing key is
            replaced by a placeholder instead of raising ``KeyError``, in
            line with the other formatters of this module.

    Returns:
        A multi-line string with one ``label : value`` line per field.
    """
    return f"""
        arXiv publication ID : {pub_id}
        Title : {content.get("title", "No title found")}
        Authors : {content.get("authors", "No authors found")}
        Release Date : {content.get("date", "No date found")}
        Abstract : {content.get("abstract", "No abstract found")}
        PDF link : {content.get("pdf", "No PDF link found")}
    """

def format_extract(message: dict) -> str:
    """Render the result of a PDF text extraction as a text block.

    Args:
        message: Mapping read for the keys ``title`` and ``text``; each
            falls back to a placeholder when absent.

    Returns:
        A multi-line string with the PDF title followed by its text.
    """
    pdf_title = message.get("title", "No title has been found")
    pdf_text = message.get("text", "No text !")
    lines = [
        "",
        f"        Title of PDF : {pdf_title}",
        f"        Text : {pdf_text}",
        "    ",
    ]
    return "\n".join(lines)

def format_result_search(page: dict) -> str:
    """Render one web search result as a text block.

    Args:
        page: Mapping read for the keys ``title``, ``body`` and ``url``.
            ``title`` and ``body`` fall back to placeholder text; a missing
            ``url`` is rendered as ``None``.

    Returns:
        A multi-line string with one ``label : value`` line per field.
    """
    fields = (
        ("Title", page.get("title", "No titles found !")),
        ("Little description", page.get("body", "No description")),
        ("PDF url", page.get("url", None)),
    )
    body = "".join(f"        {label} : {value}\n" for label, value in fields)
    return "\n" + body + "    "

def format_3gpp_doc_result(result: dict, release: int = None) -> str:
    """Render a 3GPP document lookup result as a text block.

    Args:
        result: Mapping read for the keys ``doc_id`` (rendered as ``None``
            when absent) and ``url`` (placeholder text when absent).
        release: Release version to display; ``None`` is shown as
            "Not specified".

    Returns:
        A multi-line string with the document ID, release and URL.
    """
    if release is None:
        release_label = "Not specified"
    else:
        release_label = release
    doc_id = result.get("doc_id")
    doc_url = result.get("url", "No URL found !")
    return (
        "\n"
        f"        Document ID : {doc_id}\n"
        f"        Release version : {release_label}\n"
        f"        URL : {doc_url}\n"
        "    "
    )

# Tools
@mcp.tool()
async def get_publications(keyword: str, limit: int = 15) -> str:
    """
    Get arXiv publications based on keywords and limit of documents

    Args:
        keyword: Keywords separated by spaces
        limit: Numbers of maximum publications returned (by default, 15)
    """
    # NOTE(review): the docstring above is kept verbatim; presumably FastMCP
    # publishes it as the tool description — confirm before rewording.
    url = f"{CUSTOM_ARXIV_API_BASE}/search"
    data = await make_request(url, data={"keyword": keyword, "limit": limit})

    # make_request returns None on any failure, so the emptiness check has to
    # come before the response is indexed.
    if not data:
        return "Unable to fetch publications"
    # When the response carries a truthy "error" entry, "message" is returned
    # to the caller as-is.
    if data.get("error"):
        return data.get("message", "Unable to fetch publications")

    # On success "message" is iterated as a mapping of publication ID to
    # metadata.
    found = data.get("message")
    if not found:
        return "No publications found"

    publications = [format_search(pub_id, content) for pub_id, content in found.items()]
    return "\n--\n".join(publications)

@mcp.tool()
async def web_pdf_search(query: str) -> str:
    """
    Search on the Web (with DuckDuckGo search engine) to get PDF documents based on the keywords

    Args:
        query: Keywords to search documents on the Web
    """
    endpoint = f"{DDG_API_BASE}/search"
    response = await make_request(endpoint, data={"query": query})
    # A falsy response (None or empty) means the request did not succeed.
    if not response:
        return "Unable to fetch results"

    hits = response["results"]
    if len(hits) == 0:
        return "No results found"

    # One formatted block per hit, separated by a "--" line.
    return "\n--\n".join(map(format_result_search, hits))

@mcp.tool()
async def get_3gpp_doc_url_byID(doc_id: str, release: int = None):
    """
    Get 3GPP Technical Document URL by their document ID.

    Args:
        doc_id: Document ID (i.e. C4-125411, SP-551242, 31.101)
        release : The release version of the document (by default, None)
    """
    # The release is always sent, including when it is None.
    payload = {"doc_id": doc_id, "release": release}
    found = await make_request(f"{API_3GPP_BASE}/find", data=payload)

    # A falsy response (None or empty) means the lookup did not succeed.
    if found:
        return format_3gpp_doc_result(found, release)
    return "Unable to search document in 3GPP"

@mcp.tool()
async def get_pdf_text(pdf_url: str, limit_page: int = -1) -> str:
    """
    Extract the text from the URL pointing to a PDF file

    Args:
        pdf_url: URL to a PDF document
        limit_page: How many pages the user wants to extract the content (default: -1 for all pages)
    """
    # NOTE(review): the docstring above is kept verbatim; presumably FastMCP
    # publishes it as the tool description — confirm before rewording.
    url = f"{CUSTOM_ARXIV_API_BASE}/extract_pdf/url"

    # "page_num" is only sent when the caller asked for a page limit.
    payload = {"url": pdf_url}
    if limit_page != -1:
        payload["page_num"] = limit_page

    data = await make_request(url, data=payload)

    # make_request returns None on any failure, so the emptiness check has to
    # come before the response is indexed.
    if not data:
        return "Unable to extract PDF text"
    # When the response carries a truthy "error" entry, "message" is returned
    # to the caller as-is.
    if data.get("error"):
        return data.get("message", "Unable to extract PDF text")

    # On success "message" is passed to format_extract, which reads its
    # "title" and "text" keys.
    extracted = data.get("message")
    if not extracted:
        return "No text can be extracted from this PDF"

    return format_extract(extracted)

# Start the server only when this file is executed as a script (not on
# import), using the "stdio" transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")