# VoucherVision / vouchervision / tool_wikipedia.py
import cProfile
import itertools
import json
import pstats
import re

import requests
import wikipediaapi
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
# from langchain_community.tools.wikidata.tool import WikidataAPIWrapper, WikidataQueryRun

class WikipediaLinks():
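    """Collect Wikipedia/Wikidata information for a specimen record.

    Builds an ``info_packet`` dict with taxonomy (WIKI_TAXA), geography (WIKI_GEO)
    and locality (WIKI_LOCALITY) sections and writes it to the JSON file path
    given at construction time.
    """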
def __init__(self, json_file_path_wiki) -> None:
self.json_file_path_wiki = json_file_path_wiki
self.wiki_wiki = wikipediaapi.Wikipedia(
user_agent='VoucherVision ([email protected])',
language='en'
)
self.property_to_rank = {
'P225': 'Species',
'P171': 'Family',
'P105': 'Taxon rank',
'P70': 'Genus',
            'P75': 'Clade',
            'P76': 'Subclass',  # NOTE: 'P76' was originally listed twice ('Subgenus', then 'Subclass'); a dict literal keeps only the later value
            'P67': 'Subfamily',
            'P66': 'Tribe',
            'P71': 'Subtribe',
            'P61': 'Order',
            'P72': 'Suborder',
            'P73': 'Infraorder',
            'P74': 'Superfamily',
            'P142': 'Phylum',
'P77': 'Infraclass',
'P78': 'Superorder',
'P81': 'Class',
'P82': 'Superclass',
'P84': 'Kingdom',
'P85': 'Superkingdom',
'P86': 'Subkingdom',
'P87': 'Infrakingdom',
'P88': 'Parvkingdom',
'P89': 'Domain',
'P1421': 'GRIN',
'P1070': 'KEW',
'P5037': 'POWOID',
        }
        # Infobox caches, initialized here so gather_geo() can safely read them
        # even when get_infobox_data() is never called or returns early on an error.
        self.infobox_data = {}
        self.infobox_data_locality = {}
def get_label_for_entity_id(self, entity_id):
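        """Return the English label for a Wikidata entity/property ID, or None if no English label exists."""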
url = "https://www.wikidata.org/w/api.php"
params = {
"action": "wbgetentities",
"format": "json",
"ids": entity_id,
"props": "labels",
"languages": "en" # Assuming you want the label in English
}
response = requests.get(url, params=params)
data = response.json()
return data['entities'][entity_id]['labels']['en']['value'] if 'en' in data['entities'][entity_id]['labels'] else None
def is_valid_url(self, url):
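        """Return True if a HEAD request to the URL (following redirects) answers with HTTP 200."""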
try:
response = requests.head(url, allow_redirects=True, timeout=5)
# If the response status code is 200, the URL is reachable
return response.status_code == 200
except requests.RequestException as e:
# If there was some issue with the request, such as the domain does not exist
# print(f"URL {url} is not reachable. Error: {e}")
return False
# def get_infobar_data(self, wiki_page_title):
# # Step 1: Extract the Wikidata Item ID from the Wikipedia page
# wiki_api_url = "https://en.wikipedia.org/w/api.php"
# wiki_params = {
# "action": "query",
# "format": "json",
# "titles": wiki_page_title,
# "prop": "revisions",
# "rvprop": "content",
# "rvslots": "*"
# }
# wiki_response = requests.get(wiki_api_url, params=wiki_params)
# wiki_data = wiki_response.json()
# page_key = next(iter(wiki_data['query']['pages']))
# content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']
# infobox_pattern = re.compile(r'\{\{Infobox.*?\|title\}\}', re.DOTALL)
# match = infobox_pattern.search(content)
# if match:
# wikidata_id = match.group(1) # Returns the full match including the 'Infobox' braces
# else:
# return "Infobox not found"
# # Step 2: Fetch Data from Wikidata Using the Extracted ID
# wikidata_api_url = "https://www.wikidata.org/w/api.php"
# wikidata_params = {
# "action": "wbgetentities",
# "format": "json",
# "ids": wikidata_id,
# "props": "claims" # Adjust as needed to fetch the desired data
# }
# wikidata_response = requests.get(wikidata_api_url, params=wikidata_params)
# wikidata_content = wikidata_response.json()
# classification_full = {}
# classification = {}
# label_cache = {} # Cache for labels
# # Turn this on to see the available properties to decode
# for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
# # Assuming the main snak value is what we want
# value = claims[0]['mainsnak']['datavalue']['value']
# if isinstance(value, dict): # If the value is an entity ID
# # entity_id = value['id']
# # entity_id = value['id']
# if prop_id not in label_cache:
# label_cache[prop_id] = self.get_label_for_entity_id(prop_id)
# classification_full[prop_id] = label_cache[prop_id]
# else:
# classification_full[prop_id] = value
# print(classification_full)
# Map Wikidata properties to the corresponding taxonomic ranks
def convert_to_decimal(self, coord_parts):
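        """Convert degree/minute/hemisphere parts into a 'lat,lon' decimal string.

        Expects the first six elements of coord_parts to be
        lat_deg, lat_min, lat_dir, lon_deg, lon_min, lon_dir; seconds are not handled.
        """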
lat_deg, lat_min, lat_dir, lon_deg, lon_min, lon_dir = coord_parts[:6]
lat = float(lat_deg) + float(lat_min) / 60
lon = float(lon_deg) + float(lon_min) / 60
if lat_dir == 'S':
lat = -lat
if lon_dir == 'W':
lon = -lon
return f"{lat},{lon}"
def extract_coordinates_and_region(self, coord_string):
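        """Parse a {{coord ...}} template string into decimal coordinates and a region code, if present."""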
# Extract the coordinate parts and region info
coord_parts = re.findall(r'(\d+|\w+)', coord_string)
region_info = re.search(r'region:([^|]+)\|display', coord_string)
if coord_parts and len(coord_parts) >= 6:
# Convert to decimal coordinates
decimal_coords = self.convert_to_decimal(coord_parts)
else:
decimal_coords = "Invalid coordinates format"
region = region_info.group(1) if region_info else "Region not found"
return decimal_coords, region
def parse_infobox(self, infobox_string):
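        """Parse raw infobox wikitext into a dict, assuming lines of the form '| key = value'.

        Wiki links, {{coord}} templates and bare external links are unwrapped, and
        derived keys ('region', 'decimal_coordinates', 'url_location') are added
        when the corresponding values are found.
        """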
# Split the string into lines
lines = infobox_string.split('\n')
# Dictionary to store the extracted data
infobox_data = {}
# Iterate over each line
for line in lines:
# Split the line into key and value
parts = line.split('=', 1)
# If the line is properly formatted with a key and value
if len(parts) == 2:
key = parts[0].strip()
key = key.split(' ')[1]
value = parts[1].strip()
# Handling special cases like links or coordinates
if value.startswith('[[') and value.endswith(']]'):
# Extracting linked article titles
value = value[2:-2].split('|')[0]
elif value.startswith('{{coord') and value.endswith('}}'):
# Extracting coordinates
value = value[7:-2]
elif value.startswith('[') and value.endswith(']') and ('http' in value):
value = value[1:-1]
url_parts = value.split(" ")
infobox_data['url_location'] = next((part for part in url_parts if 'http' in part), None)
if key == 'coordinates':
decimal_coordinates, region = self.extract_coordinates_and_region(value)
infobox_data['region'] = region
infobox_data['decimal_coordinates'] = decimal_coordinates
key = self.sanitize(key)
value = self.sanitize(value)
value = self.remove_html_and_wiki_markup(value)
# Add to dictionary
infobox_data[key] = value
return infobox_data
def get_infobox_data(self, wiki_page_title, opt=None):
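        """Fetch a page's wikitext and parse its first infobox.

        The parsed data is stored on self.infobox_data (opt is None) or
        self.infobox_data_locality (opt given); a short error string is returned
        if the page or its infobox cannot be retrieved.
        """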
wiki_api_url = "https://en.wikipedia.org/w/api.php"
wiki_params = {
"action": "query",
"format": "json",
"titles": wiki_page_title,
"prop": "revisions",
"rvprop": "content",
"rvslots": "*"
}
try:
wiki_response = requests.get(wiki_api_url, params=wiki_params)
wiki_response.raise_for_status() # Check for HTTP errors
except requests.RequestException as e:
return f"Error fetching data: {e}"
wiki_data = wiki_response.json()
page_key = next(iter(wiki_data['query']['pages']), None)
if page_key is None or "missing" in wiki_data['query']['pages'][page_key]:
return "Page not found"
content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']
infobox_pattern = re.compile(r'\{\{Infobox.*?\}\}', re.DOTALL)
match = infobox_pattern.search(content)
if match:
infobox_content = match.group()
else:
self.infobox_data = {}
self.infobox_data_locality = {}
return "Infobox not found"
if opt is None:
self.infobox_data = self.parse_infobox(infobox_content)
else:
self.infobox_data_locality = self.parse_infobox(infobox_content)
# Example usage
# for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
# # Get the taxonomic rank from the mapping
# rank = self.property_to_rank.get(prop_id)
# if rank:
# value = claims[0]['mainsnak']['datavalue']['value']
# if isinstance(value, dict): # If the value is an entity ID
# entity_id = value['id']
# if entity_id not in label_cache:
# label_cache[entity_id] = self.get_label_for_entity_id(entity_id)
# classification[rank] = label_cache[entity_id]
# else:
# classification[rank] = value
# try:
# unknown_link = "https://powo.science.kew.org/taxon/" + classification['POWOID']
# if self.is_valid_url(unknown_link):
# classification['POWOID'] = unknown_link
# classification['POWOID_syn'] = unknown_link + '#synonyms'
# except:
# pass
# return classification
def get_taxonbar_data(self, wiki_page_title):
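        """Resolve the page's Taxonbar Wikidata ID and return a dict of taxonomic ranks.

        Property IDs are mapped to rank names via self.property_to_rank, and a POWO
        link is added when a 'POWOID' value is present and reachable.
        """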
# Step 1: Extract the Wikidata Item ID from the Wikipedia page
wiki_api_url = "https://en.wikipedia.org/w/api.php"
wiki_params = {
"action": "query",
"format": "json",
"titles": wiki_page_title,
"prop": "revisions",
"rvprop": "content",
"rvslots": "*"
}
wiki_response = requests.get(wiki_api_url, params=wiki_params)
wiki_data = wiki_response.json()
page_key = next(iter(wiki_data['query']['pages']))
content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']
taxonbar_match = re.search(r'\{\{Taxonbar\|from=(Q\d+)\}\}', content)
if not taxonbar_match:
return "Taxonbar not found"
wikidata_id = taxonbar_match.group(1)
# Step 2: Fetch Data from Wikidata Using the Extracted ID
wikidata_api_url = "https://www.wikidata.org/w/api.php"
wikidata_params = {
"action": "wbgetentities",
"format": "json",
"ids": wikidata_id,
"props": "claims" # Adjust as needed to fetch the desired data
}
wikidata_response = requests.get(wikidata_api_url, params=wikidata_params)
wikidata_content = wikidata_response.json()
classification_full = {}
classification = {}
label_cache = {} # Cache for labels
# Turn this on to see the available properties to decode
# for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
# # Assuming the main snak value is what we want
# value = claims[0]['mainsnak']['datavalue']['value']
# if isinstance(value, dict): # If the value is an entity ID
# # entity_id = value['id']
# # entity_id = value['id']
# if prop_id not in label_cache:
# label_cache[prop_id] = self.get_label_for_entity_id(prop_id)
# classification_full[prop_id] = label_cache[prop_id]
# else:
# classification_full[prop_id] = value
# print(classification_full)
# Map Wikidata properties to the corresponding taxonomic ranks
for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
# Get the taxonomic rank from the mapping
rank = self.property_to_rank.get(prop_id)
if rank:
value = claims[0]['mainsnak']['datavalue']['value']
if isinstance(value, dict): # If the value is an entity ID
entity_id = value['id']
if entity_id not in label_cache:
label_cache[entity_id] = self.get_label_for_entity_id(entity_id)
classification[rank] = label_cache[entity_id]
else:
classification[rank] = value
try:
unknown_link = "https://powo.science.kew.org/taxon/" + classification['POWOID']
if self.is_valid_url(unknown_link):
classification['POWOID'] = unknown_link
classification['POWOID_syn'] = unknown_link + '#synonyms'
        except (KeyError, TypeError):
            # 'POWOID' missing (or not a string); skip the POWO links
pass
return classification
def extract_page_title(self, result_string):
first_line = result_string.split('\n')[0]
page_title = first_line.replace('Page: ', '').strip()
return page_title
def get_wikipedia_url(self, page_title):
page = self.wiki_wiki.page(page_title)
if page.exists():
return page.fullurl
else:
return None
def extract_info_taxa(self, page):
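        """Populate info_packet['WIKI_TAXA']['DATA'] with Taxonbar/Wikidata ranks for the page (backlink collection is disabled below)."""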
links = []
self.info_packet['WIKI_TAXA']['LINKS'] = {}
self.info_packet['WIKI_TAXA']['DATA'] = {}
self.info_packet['WIKI_TAXA']['DATA'].update(self.get_taxonbar_data(page.title))
# for back in page.backlinks:
# back = self.sanitize(back)
# if ':' not in back:
# link = self.sanitize(self.get_wikipedia_url(back))
# if link not in links:
# links.append(link)
# self.info_packet['WIKI_TAXA']['LINKS'][back] = link
def extract_info_geo(self, page, opt=None):
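        """Fetch the page's infobox data and collect up to 10 backlink URLs into info_packet['WIKI_GEO']['LINKS']."""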
links = []
self.info_packet['WIKI_GEO']['LINKS'] = {}
if opt is None:
self.get_infobox_data(page.title)
else:
self.get_infobox_data(page.title,opt=opt)
for back in itertools.islice(page.backlinks, 10):
back = self.sanitize(back)
if ':' not in back:
link = self.sanitize(self.get_wikipedia_url(back))
if link not in links:
links.append(link)
self.info_packet['WIKI_GEO']['LINKS'][back] = link
def gather_geo(self, query,opt=None):
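        """Search Wikipedia for a geographic query and store the page title, link and summary.

        Results go to info_packet['WIKI_GEO'] when opt is None, otherwise to
        info_packet['WIKI_LOCALITY'].
        """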
if opt is None:
self.info_packet['WIKI_GEO']['DATA'] = {}
else:
self.info_packet['WIKI_LOCALITY']['DATA'] = {}
wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
result = wikipedia.run(query)
summary = result.split('Summary:')[1]
summary = self.sanitize(summary)
# print(result)
page_title = self.extract_page_title(result)
page = self.wiki_wiki.page(page_title)
# Do these first, they are less likely to fail
if opt is None:
self.info_packet['WIKI_GEO']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
self.info_packet['WIKI_GEO']['PAGE_TITLE'] = page_title
self.info_packet['WIKI_GEO']['SUMMARY'] = summary
else:
self.info_packet['WIKI_LOCALITY']['PAGE_TITLE'] = page_title
self.info_packet['WIKI_LOCALITY']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
self.info_packet['WIKI_LOCALITY']['SUMMARY'] = summary
        # If the page exists, gather the more complex data (infobox + backlinks) last, in case of failure.
        # Disabled for now: it may not be useful enough to justify the extra time.
        # if page.exists():
        #     if opt is None:
        #         self.extract_info_geo(page)
        #     else:
        #         self.extract_info_geo(page, opt=opt)
        # NOTE: infobox_data / infobox_data_locality are only populated when
        # get_infobox_data() (via extract_info_geo) has run; otherwise they stay empty.
        if opt is None:
            self.info_packet['WIKI_GEO']['DATA'].update(self.infobox_data)
        else:
            self.info_packet['WIKI_LOCALITY']['DATA'].update(self.infobox_data_locality)
def gather_taxonomy(self, query):
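        """Search Wikipedia for a taxon query and fill info_packet['WIKI_TAXA'] with page info and Wikidata ranks."""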
wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
# query = "Tracaulon sagittatum Tracaulon sagittatum"
result = wikipedia.run(query)
summary = result.split('Summary:')[1]
summary = self.sanitize(summary)
# print(result)
page_title = self.extract_page_title(result)
page = self.wiki_wiki.page(page_title)
# Check if the page exists
if page.exists():
self.extract_info_taxa(page)
self.info_packet['WIKI_TAXA']['PAGE_TITLE'] = page_title
self.info_packet['WIKI_TAXA']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
self.info_packet['WIKI_TAXA']['SUMMARY'] = summary
return self.info_packet
def gather_wikipedia_results(self, output):
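        """Build the full info_packet from a transcription output dict and write it to self.json_file_path_wiki.

        Geographic, locality and taxonomic queries are assembled from fields such as
        'municipality', 'locality', 'scientificName' and 'genus'; failed lookups are
        skipped silently.
        """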
self.info_packet = {}
self.info_packet['WIKI_TAXA'] = {}
self.info_packet['WIKI_GEO'] = {}
self.info_packet['WIKI_LOCALITY'] = {}
municipality = output.get('municipality','')
county = output.get('county','')
stateProvince = output.get('stateProvince','')
country = output.get('country','')
locality = output.get('locality','')
order = output.get('order','')
family = output.get('family','')
scientificName = output.get('scientificName','')
genus = output.get('genus','')
specificEpithet = output.get('specificEpithet','')
query_geo = ' '.join([municipality, county, stateProvince, country]).strip()
query_locality = locality.strip()
query_taxa_primary = scientificName.strip()
query_taxa_secondary = ' '.join([genus, specificEpithet]).strip()
query_taxa_tertiary = ' '.join([order, family, genus, specificEpithet]).strip()
# query_taxa = "Tracaulon sagittatum Tracaulon sagittatum"
# query_geo = "Indiana Porter Co."
# query_locality = "Mical Springs edge"
if query_geo:
try:
self.gather_geo(query_geo)
            except Exception:
pass
if query_locality:
try:
self.gather_geo(query_locality,'locality')
            except Exception:
pass
queries_taxa = [query_taxa_primary, query_taxa_secondary, query_taxa_tertiary]
for q in queries_taxa:
if q:
try:
self.gather_taxonomy(q)
break
                except Exception:
pass
# print(self.info_packet)
# return self.info_packet
# self.gather_geo(query_geo)
try:
with open(self.json_file_path_wiki, 'w', encoding='utf-8') as file:
json.dump(self.info_packet, file, indent=4)
        except Exception:
            # json.dump can fail on badly encoded strings; sanitize the packet and retry
            sanitized_data = self.sanitize(self.info_packet)
            with open(self.json_file_path_wiki, 'w', encoding='utf-8') as file:
                json.dump(sanitized_data, file, indent=4)
        return self.info_packet
def sanitize(self, data):
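        """Recursively drop characters that cannot be encoded as UTF-8 from strings in nested dicts/lists."""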
if isinstance(data, dict):
return {self.sanitize(key): self.sanitize(value) for key, value in data.items()}
elif isinstance(data, list):
return [self.sanitize(element) for element in data]
elif isinstance(data, str):
return data.encode('utf-8', 'ignore').decode('utf-8')
else:
return data
def remove_html_and_wiki_markup(self, text):
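        """Strip HTML tags, wiki links and simple {{...}} templates from text, keeping the visible text."""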
# Remove HTML tags
clean_text = re.sub(r'<.*?>', '', text)
# Remove Wiki links but keep the text inside
# For example, '[[Greg Abbott]]' becomes 'Greg Abbott'
clean_text = re.sub(r'\[\[(?:[^\]|]*\|)?([^\]|]*)\]\]', r'\1', clean_text)
# Remove Wiki template markup, e.g., '{{nowrap|text}}' becomes 'text'
clean_text = re.sub(r'\{\{(?:[^\}|]*\|)?([^\}|]*)\}\}', r'\1', clean_text)
return clean_text
if __name__ == '__main__':
test_output = {
"filename": "MICH_7375774_Polygonaceae_Persicaria_",
"catalogNumber": "1439649",
"order": "",
"family": "",
"scientificName": "Tracaulon sagittatum",
"scientificNameAuthorship": "",
"genus": "Tracaulon",
"subgenus": "",
"specificEpithet": "sagittatum",
"infraspecificEpithet": "",
"identifiedBy": "",
"recordedBy": "Marcus W. Lyon, Jr.",
"recordNumber": "TX 11",
"verbatimEventDate": "1927",
"eventDate": "1927-00-00",
"habitat": "wet subdunal woods",
"occurrenceRemarks": "Flowers pink",
"country": "Indiana",
"stateProvince": "Porter Co.",
"county": "",
"municipality": "",
"locality": "Mical Springs edge",
"degreeOfEstablishment": "",
"decimalLatitude": "",
"decimalLongitude": "",
"verbatimCoordinates": "",
"minimumElevationInMeters": "",
"maximumElevationInMeters": ""
}
do_print_profiler = True
if do_print_profiler:
profiler = cProfile.Profile()
profiler.enable()
Wiki = WikipediaLinks('D:/D_Desktop/usda_pdf/test.json')
    info_packet = Wiki.gather_wikipedia_results(test_output)
if do_print_profiler:
profiler.disable()
stats = pstats.Stats(profiler).sort_stats('cumulative')
stats.print_stats(50)