Alfred828 committed on
Commit
7672dcd
·
verified ·
1 Parent(s): 10e6230

Create tools/wiki_search.py

Browse files
Files changed (1) hide show
  1. tools/wiki_search.py +132 -0
tools/wiki_search.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ import requests
4
+ from markdownify import markdownify
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class WikipediaOpensearchInput(BaseModel):
    """Input schema for a title search against English Wikipedia."""

    # ``...`` marks the field as required (no default value).
    query: str = Field(
        ...,
        description="The search term or keyword to look up on English Wikipedia.",
    )
12
+
13
+
14
class GetPageTitleExcerptSectionsInput(BaseModel):
    """Input schema for fetching a page's extract and section list."""

    # ``...`` marks the field as required (no default value).
    page_title: str = Field(
        ...,
        description="The exact title of the Wikipedia page.",
    )
16
+
17
+
18
class GetPageSectionContentInput(BaseModel):
    """Input schema for fetching the content of one section of a page."""

    page_title: str = Field(description="The exact Wikipedia article title.")
    # The text must be passed as ``description=``: the first positional
    # argument of ``Field`` is the *default value*, so passing the text
    # positionally made the field optional with the description as its value.
    section_index: str = Field(
        description="The index of the section (from section metadata)."
    )
21
+
22
+
23
class SearchWikipediaEn:
    """Small client for the English Wikipedia MediaWiki Action API.

    Every public method is a static method that performs blocking HTTP GET
    requests against ``https://en.wikipedia.org/w/api.php`` and returns the
    decoded (or Markdown-converted) result.
    """

    _API_URL = "https://en.wikipedia.org/w/api.php"
    # ``requests`` has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    _TIMEOUT_SECONDS = 10

    @staticmethod
    def _get_json(params: dict[str, Any]) -> Any:
        """Send one GET request to the API and return the decoded JSON body.

        Args:
            params: Query-string parameters for the Action API call.

        Returns:
            The decoded JSON payload (a list or dict, depending on the action).

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within the timeout.
        """
        response = requests.get(
            url=SearchWikipediaEn._API_URL,
            params=params,
            timeout=SearchWikipediaEn._TIMEOUT_SECONDS,
        )
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()

    @staticmethod
    def wikipedia_opensearch(query: str) -> list[Any]:
        """Search English Wikipedia article titles via the OpenSearch action.

        Args:
            query (str): The search term or keyword to look up on English Wikipedia.

        Returns:
            list: The decoded OpenSearch response: the query, then parallel
            lists of matched titles, descriptions, and URLs (at most 10,
            main namespace only).
            Example:
                [
                    'Python programming',
                    ['Python (programming language)', ... ],
                    ['...', ...],
                    ['https://en.wikipedia.org/wiki/Python_(programming_language)', ...]
                ]

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within the timeout.
        """
        params = {
            "action": "opensearch",
            "namespace": "0",
            "search": query,
            "limit": 10,
            "format": "json",
        }
        return SearchWikipediaEn._get_json(params)

    @staticmethod
    def get_page_title_excerpt_sections(page_title: str) -> dict[str, Any]:
        """Retrieve the plain-text extract and section structure of an article.

        Two API calls are made: a ``query``/``extracts`` call for the text,
        then a ``parse``/``sections`` call for the section metadata. The
        second call uses the page title reported by the first response.

        Args:
            page_title (str): The exact title of the Wikipedia page.

        Returns:
            dict: {
                'excerpt': (str) Plain-text extract of the page,
                'sections': (list) List of dictionaries describing section metadata.
            }
            Example:
                {
                    "excerpt": "Python is a high-level programming language...",
                    "sections": [
                        {"toclevel": 1, "level": "2", "line": "History", "index": "1", ...},
                        ...
                    ]
                }

        Raises:
            KeyError: If a response lacks the expected fields (for example,
                presumably, when the page does not exist).
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within the timeout.
        """
        extract_params = {
            "action": "query",
            "prop": "extracts",
            "titles": page_title,
            "explaintext": True,
            "format": "json",
            "formatversion": 2,
        }
        # Decode the body once and reuse it for both fields.
        page = SearchWikipediaEn._get_json(extract_params)["query"]["pages"][0]
        resolved_title = page["title"]
        page_excerpt = page["extract"]

        sections_params = {
            "action": "parse",
            "prop": "sections",
            "page": resolved_title,
            "format": "json",
        }
        page_sections = SearchWikipediaEn._get_json(sections_params)["parse"][
            "sections"
        ]

        return {
            "excerpt": page_excerpt,
            "sections": page_sections,
        }

    @staticmethod
    def get_page_section_content(page_title: str, section_index: str) -> str:
        """Fetch one section of a Wikipedia article, converted to Markdown.

        Args:
            page_title (str): The exact Wikipedia article title.
            section_index (str): The index of the section (from section
                metadata). When it is ``None`` or empty, no section filter is
                sent and the whole page is rendered.

        Returns:
            str: Markdown-formatted content of the specified section.
            Example:
                '# History' followed by 'Python was conceived in the late 1980s...'

        Raises:
            KeyError: If the response lacks the expected ``parse`` payload.
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within the timeout.
        """
        params = {
            "action": "parse",
            "page": page_title,
            "format": "json",
            "prop": "text",
        }
        # Stringify before the emptiness check so that an integer 0 (the lead
        # section) is kept rather than treated as "no section given".
        section = "" if section_index is None else str(section_index)
        if section:
            params["section"] = section

        payload = SearchWikipediaEn._get_json(params)
        return markdownify(payload["parse"]["text"]["*"])