Spaces:

phyloforfun
/

VoucherVision

Running

File size: 22,152 Bytes

import itertools, wikipediaapi, requests, re, json
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
# from langchain_community.tools.wikidata.tool import WikidataAPIWrapper, WikidataQueryRun
import cProfile
import pstats

class WikipediaLinks():


    def __init__(self, json_file_path_wiki) -> None:
        self.json_file_path_wiki = json_file_path_wiki
        self.wiki_wiki = wikipediaapi.Wikipedia(
            user_agent='VoucherVision ([email protected])',
            language='en'
        )
        self.property_to_rank = {
            'P225': 'Species',
            'P171': 'Family',
            'P105': 'Taxon rank',
            'P70': 'Genus',
            'P75': 'Clade',
            'P76': 'Subgenus',
            'P67': 'Subfamily',
            'P66': 'Tribe',
            'P71': 'Subtribe',
            'P61': 'Order',
            'P72': 'Suborder',
            'P73': 'Infraorder',
            'P74': 'Superfamily',
            'P142': 'Phylum',
            'P75': 'Clade',
            'P76': 'Subclass',
            'P77': 'Infraclass',
            'P78': 'Superorder',
            'P81': 'Class',
            'P82': 'Superclass',
            'P84': 'Kingdom',
            'P85': 'Superkingdom',
            'P86': 'Subkingdom',
            'P87': 'Infrakingdom',
            'P88': 'Parvkingdom',
            'P89': 'Domain',
            'P1421': 'GRIN',
            'P1070': 'KEW',
            'P5037': 'POWOID',
        }


    def get_label_for_entity_id(self, entity_id):
        url = "https://www.wikidata.org/w/api.php"
        params = {
            "action": "wbgetentities",
            "format": "json",
            "ids": entity_id,
            "props": "labels",
            "languages": "en"  # Assuming you want the label in English
        }
        response = requests.get(url, params=params)
        data = response.json()
        return data['entities'][entity_id]['labels']['en']['value'] if 'en' in data['entities'][entity_id]['labels'] else None


    def is_valid_url(self, url):
        try:
            response = requests.head(url, allow_redirects=True, timeout=5)
            # If the response status code is 200, the URL is reachable
            return response.status_code == 200
        except requests.RequestException as e:
            # If there was some issue with the request, such as the domain does not exist
            # print(f"URL {url} is not reachable. Error: {e}")
            return False
    
    # def get_infobar_data(self, wiki_page_title):
    #     # Step 1: Extract the Wikidata Item ID from the Wikipedia page
    #     wiki_api_url = "https://en.wikipedia.org/w/api.php"
    #     wiki_params = {
    #         "action": "query",
    #         "format": "json",
    #         "titles": wiki_page_title,
    #         "prop": "revisions",
    #         "rvprop": "content",
    #         "rvslots": "*"
    #     }

    #     wiki_response = requests.get(wiki_api_url, params=wiki_params)
    #     wiki_data = wiki_response.json()

    #     page_key = next(iter(wiki_data['query']['pages']))
    #     content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']

    #     infobox_pattern = re.compile(r'\{\{Infobox.*?\|title\}\}', re.DOTALL)    
    #     match = infobox_pattern.search(content)
    #     if match:
    #         wikidata_id =  match.group(1)  # Returns the full match including the 'Infobox' braces
    #     else:
    #         return "Infobox not found"

    #     # Step 2: Fetch Data from Wikidata Using the Extracted ID
    #     wikidata_api_url = "https://www.wikidata.org/w/api.php"
    #     wikidata_params = {
    #         "action": "wbgetentities",
    #         "format": "json",
    #         "ids": wikidata_id,
    #         "props": "claims"  # Adjust as needed to fetch the desired data
    #     }

    #     wikidata_response = requests.get(wikidata_api_url, params=wikidata_params)
    #     wikidata_content = wikidata_response.json()
        

    #     classification_full = {}
    #     classification = {}
    #     label_cache = {}  # Cache for labels


    #     # Turn this on to see the available properties to decode
    #     for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
    #         # Assuming the main snak value is what we want
    #         value = claims[0]['mainsnak']['datavalue']['value']
    #         if isinstance(value, dict):  # If the value is an entity ID
    #             # entity_id = value['id']
    #             # entity_id = value['id']
    #             if prop_id not in label_cache:
    #                 label_cache[prop_id] = self.get_label_for_entity_id(prop_id)
    #             classification_full[prop_id] = label_cache[prop_id]
    #         else:
    #             classification_full[prop_id] = value
    #     print(classification_full)
        # Map Wikidata properties to the corresponding taxonomic ranks

    def convert_to_decimal(self, coord_parts):
        lat_deg, lat_min, lat_dir, lon_deg, lon_min, lon_dir = coord_parts[:6]

        lat = float(lat_deg) + float(lat_min) / 60
        lon = float(lon_deg) + float(lon_min) / 60

        if lat_dir == 'S':
            lat = -lat
        if lon_dir == 'W':
            lon = -lon

        return f"{lat},{lon}"


    def extract_coordinates_and_region(self, coord_string):
        # Extract the coordinate parts and region info
        coord_parts = re.findall(r'(\d+|\w+)', coord_string)
        region_info = re.search(r'region:([^|]+)\|display', coord_string)

        if coord_parts and len(coord_parts) >= 6:
            # Convert to decimal coordinates
            decimal_coords = self.convert_to_decimal(coord_parts)
        else:
            decimal_coords = "Invalid coordinates format"

        region = region_info.group(1) if region_info else "Region not found"
        return decimal_coords, region
    

    def parse_infobox(self, infobox_string):
        # Split the string into lines
        lines = infobox_string.split('\n')

        # Dictionary to store the extracted data
        infobox_data = {}

        # Iterate over each line
        for line in lines:
            # Split the line into key and value
            parts = line.split('=', 1)

            # If the line is properly formatted with a key and value
            if len(parts) == 2:
                key = parts[0].strip()
                key = key.split(' ')[1]
                value = parts[1].strip()

                # Handling special cases like links or coordinates
                if value.startswith('[[') and value.endswith(']]'):
                    # Extracting linked article titles
                    value = value[2:-2].split('|')[0]
                elif value.startswith('{{coord') and value.endswith('}}'):
                    # Extracting coordinates
                    value = value[7:-2]
                elif value.startswith('[') and value.endswith(']') and ('http' in value):
                    value = value[1:-1]
                    url_parts = value.split(" ")
                    infobox_data['url_location'] = next((part for part in url_parts if 'http' in part), None)

                if key == 'coordinates':
                    decimal_coordinates, region = self.extract_coordinates_and_region(value)
                    infobox_data['region'] = region
                    infobox_data['decimal_coordinates'] = decimal_coordinates

                key = self.sanitize(key)
                value = self.sanitize(value)
                value = self.remove_html_and_wiki_markup(value)
                # Add to dictionary
                infobox_data[key] = value

        return infobox_data

    def get_infobox_data(self, wiki_page_title, opt=None):
        wiki_api_url = "https://en.wikipedia.org/w/api.php"
        wiki_params = {
            "action": "query",
            "format": "json",
            "titles": wiki_page_title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "*"
        }

        try:
            wiki_response = requests.get(wiki_api_url, params=wiki_params)
            wiki_response.raise_for_status()  # Check for HTTP errors
        except requests.RequestException as e:
            return f"Error fetching data: {e}"

        wiki_data = wiki_response.json()

        page_key = next(iter(wiki_data['query']['pages']), None)
        if page_key is None or "missing" in wiki_data['query']['pages'][page_key]:
            return "Page not found"

        content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']

        infobox_pattern = re.compile(r'\{\{Infobox.*?\}\}', re.DOTALL)
        match = infobox_pattern.search(content)
        
        if match:
            infobox_content = match.group()
        else:
            self.infobox_data = {}
            self.infobox_data_locality = {}
            return "Infobox not found"

        if opt is None:
            self.infobox_data = self.parse_infobox(infobox_content)
        else:
            self.infobox_data_locality = self.parse_infobox(infobox_content)



        # Example usage

        # for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
        #     # Get the taxonomic rank from the mapping
        #     rank = self.property_to_rank.get(prop_id)
        #     if rank:
        #         value = claims[0]['mainsnak']['datavalue']['value']
        #         if isinstance(value, dict):  # If the value is an entity ID
        #             entity_id = value['id']
        #             if entity_id not in label_cache:
        #                 label_cache[entity_id] = self.get_label_for_entity_id(entity_id)
        #             classification[rank] = label_cache[entity_id]
        #         else:
        #             classification[rank] = value

        # try:
        #     unknown_link = "https://powo.science.kew.org/taxon/" + classification['POWOID']
        #     if self.is_valid_url(unknown_link):
        #         classification['POWOID'] = unknown_link
        #         classification['POWOID_syn'] = unknown_link + '#synonyms'
        # except:
        #     pass
        # return classification



    def get_taxonbar_data(self, wiki_page_title):
        # Step 1: Extract the Wikidata Item ID from the Wikipedia page
        wiki_api_url = "https://en.wikipedia.org/w/api.php"
        wiki_params = {
            "action": "query",
            "format": "json",
            "titles": wiki_page_title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "*"
        }

        wiki_response = requests.get(wiki_api_url, params=wiki_params)
        wiki_data = wiki_response.json()

        page_key = next(iter(wiki_data['query']['pages']))
        content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']

        taxonbar_match = re.search(r'\{\{Taxonbar\|from=(Q\d+)\}\}', content)
        if not taxonbar_match:
            return "Taxonbar not found"

        wikidata_id = taxonbar_match.group(1)

        # Step 2: Fetch Data from Wikidata Using the Extracted ID
        wikidata_api_url = "https://www.wikidata.org/w/api.php"
        wikidata_params = {
            "action": "wbgetentities",
            "format": "json",
            "ids": wikidata_id,
            "props": "claims"  # Adjust as needed to fetch the desired data
        }

        wikidata_response = requests.get(wikidata_api_url, params=wikidata_params)
        wikidata_content = wikidata_response.json()
        

        classification_full = {}
        classification = {}
        label_cache = {}  # Cache for labels


        # Turn this on to see the available properties to decode
        # for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
        #     # Assuming the main snak value is what we want
        #     value = claims[0]['mainsnak']['datavalue']['value']
        #     if isinstance(value, dict):  # If the value is an entity ID
        #         # entity_id = value['id']
        #         # entity_id = value['id']
        #         if prop_id not in label_cache:
        #             label_cache[prop_id] = self.get_label_for_entity_id(prop_id)
        #         classification_full[prop_id] = label_cache[prop_id]
        #     else:
        #         classification_full[prop_id] = value
        # print(classification_full)
        # Map Wikidata properties to the corresponding taxonomic ranks
        

        for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
            # Get the taxonomic rank from the mapping
            rank = self.property_to_rank.get(prop_id)
            if rank:
                value = claims[0]['mainsnak']['datavalue']['value']
                if isinstance(value, dict):  # If the value is an entity ID
                    entity_id = value['id']
                    if entity_id not in label_cache:
                        label_cache[entity_id] = self.get_label_for_entity_id(entity_id)
                    classification[rank] = label_cache[entity_id]
                else:
                    classification[rank] = value

        try:
            unknown_link = "https://powo.science.kew.org/taxon/" + classification['POWOID']
            if self.is_valid_url(unknown_link):
                classification['POWOID'] = unknown_link
                classification['POWOID_syn'] = unknown_link + '#synonyms'
        except:
            pass
        return classification


    def extract_page_title(self, result_string):
        first_line = result_string.split('\n')[0]
        page_title = first_line.replace('Page: ', '').strip()
        return page_title


    def get_wikipedia_url(self, page_title):
        page = self.wiki_wiki.page(page_title)
        if page.exists():
            return page.fullurl
        else:
            return None


    def extract_info_taxa(self, page):
        links = []
        self.info_packet['WIKI_TAXA']['LINKS'] = {}
        self.info_packet['WIKI_TAXA']['DATA'] = {}

        self.info_packet['WIKI_TAXA']['DATA'].update(self.get_taxonbar_data(page.title))

        # for back in page.backlinks:
        #     back = self.sanitize(back) 
        #     if ':' not in back:
        #         link = self.sanitize(self.get_wikipedia_url(back))
        #         if link not in links:
        #             links.append(link)
        #             self.info_packet['WIKI_TAXA']['LINKS'][back] = link


    def extract_info_geo(self, page, opt=None):
        links = []
        self.info_packet['WIKI_GEO']['LINKS'] = {}
        if opt is None:
            self.get_infobox_data(page.title)
        else:
            self.get_infobox_data(page.title,opt=opt)

        for back in itertools.islice(page.backlinks, 10):  
            back = self.sanitize(back) 
            if ':' not in back:
                link = self.sanitize(self.get_wikipedia_url(back))
                if link not in links:
                    links.append(link)
                    self.info_packet['WIKI_GEO']['LINKS'][back] = link


    def gather_geo(self, query,opt=None):
        if opt is None:
            self.info_packet['WIKI_GEO']['DATA'] = {}
        else:
            self.info_packet['WIKI_LOCALITY']['DATA'] = {}
            
        wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())

        result = wikipedia.run(query)
        summary = result.split('Summary:')[1]
        summary = self.sanitize(summary)
        # print(result)
        page_title = self.extract_page_title(result)

        page = self.wiki_wiki.page(page_title)

        # Do these first, they are less likely to fail
        if opt is None:
            self.info_packet['WIKI_GEO']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
            self.info_packet['WIKI_GEO']['PAGE_TITLE'] = page_title
            self.info_packet['WIKI_GEO']['SUMMARY'] = summary

        else:
            self.info_packet['WIKI_LOCALITY']['PAGE_TITLE'] = page_title
            self.info_packet['WIKI_LOCALITY']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
            self.info_packet['WIKI_LOCALITY']['SUMMARY'] = summary


        # Check if the page exists, get the more complex data. Do it last in case of failure ########################## This might not be useful enough to justify the time
        # if page.exists():
        #     if opt is None:
        #         self.extract_info_geo(page)
        #     else:
        #         self.extract_info_geo(page, opt=opt)

        if opt is None:
            self.info_packet['WIKI_GEO']['DATA'].update(self.infobox_data)
        else:
            self.info_packet['WIKI_LOCALITY']['DATA'].update(self.infobox_data_locality)


    def gather_taxonomy(self, query):
        wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())

        # query = "Tracaulon sagittatum Tracaulon sagittatum"
        result = wikipedia.run(query)
        summary = result.split('Summary:')[1]
        summary = self.sanitize(summary)
        # print(result)
        page_title = self.extract_page_title(result)

        page = self.wiki_wiki.page(page_title)

        # Check if the page exists
        if page.exists():
            self.extract_info_taxa(page)

        self.info_packet['WIKI_TAXA']['PAGE_TITLE'] = page_title
        self.info_packet['WIKI_TAXA']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
        self.info_packet['WIKI_TAXA']['SUMMARY'] = summary
        return self.info_packet 
    

    def gather_wikipedia_results(self, output):
        self.info_packet = {}
        self.info_packet['WIKI_TAXA'] = {}
        self.info_packet['WIKI_GEO'] = {}
        self.info_packet['WIKI_LOCALITY'] = {}

        municipality = output.get('municipality','')
        county = output.get('county','')
        stateProvince = output.get('stateProvince','')
        country = output.get('country','')

        locality = output.get('locality','')

        order = output.get('order','')
        family = output.get('family','')
        scientificName = output.get('scientificName','')
        genus = output.get('genus','')
        specificEpithet = output.get('specificEpithet','')


        query_geo = ' '.join([municipality, county, stateProvince, country]).strip()
        query_locality = locality.strip()
        query_taxa_primary = scientificName.strip()
        query_taxa_secondary = ' '.join([genus, specificEpithet]).strip()
        query_taxa_tertiary = ' '.join([order, family, genus, specificEpithet]).strip()

        # query_taxa = "Tracaulon sagittatum Tracaulon sagittatum"
        # query_geo = "Indiana Porter Co."
        # query_locality = "Mical Springs edge"
        
        if query_geo:
            try:
                self.gather_geo(query_geo)
            except:
                pass
        
        if query_locality:
            try:
                self.gather_geo(query_locality,'locality')
            except:
                pass
        
        queries_taxa = [query_taxa_primary, query_taxa_secondary, query_taxa_tertiary]
        for q in queries_taxa:
            if q:
                try:
                    self.gather_taxonomy(q)
                    break
                except:
                    pass

        # print(self.info_packet)
        # return self.info_packet
        # self.gather_geo(query_geo)
        try:
            with open(self.json_file_path_wiki, 'w', encoding='utf-8') as file:
                json.dump(self.info_packet, file, indent=4)
        except:
            sanitized_data = self.sanitize(self.info_packet)
            with open(self.json_file_path_wiki, 'w', encoding='utf-8') as file:
                json.dump(sanitized_data, file, indent=4)
        
        
    def sanitize(self, data):
        if isinstance(data, dict):
            return {self.sanitize(key): self.sanitize(value) for key, value in data.items()}
        elif isinstance(data, list):
            return [self.sanitize(element) for element in data]
        elif isinstance(data, str):
            return data.encode('utf-8', 'ignore').decode('utf-8')
        else:
            return data
  
    def remove_html_and_wiki_markup(self, text):
        # Remove HTML tags
        clean_text = re.sub(r'<.*?>', '', text)

        # Remove Wiki links but keep the text inside
        # For example, '[[Greg Abbott]]' becomes 'Greg Abbott'
        clean_text = re.sub(r'\[\[(?:[^\]|]*\|)?([^\]|]*)\]\]', r'\1', clean_text)

        # Remove Wiki template markup, e.g., '{{nowrap|text}}' becomes 'text'
        clean_text = re.sub(r'\{\{(?:[^\}|]*\|)?([^\}|]*)\}\}', r'\1', clean_text)

        return clean_text


if __name__ == '__main__':
    test_output = {
    "filename": "MICH_7375774_Polygonaceae_Persicaria_",
    "catalogNumber": "1439649",
    "order": "",
    "family": "",
    "scientificName": "Tracaulon sagittatum",
    "scientificNameAuthorship": "",
    "genus": "Tracaulon",
    "subgenus": "",
    "specificEpithet": "sagittatum",
    "infraspecificEpithet": "",
    "identifiedBy": "",
    "recordedBy": "Marcus W. Lyon, Jr.",
    "recordNumber": "TX 11",
    "verbatimEventDate": "1927",
    "eventDate": "1927-00-00",
    "habitat": "wet subdunal woods",
    "occurrenceRemarks": "Flowers pink",
    "country": "Indiana",
    "stateProvince": "Porter Co.",
    "county": "",
    "municipality": "",
    "locality": "Mical Springs edge",
    "degreeOfEstablishment": "",
    "decimalLatitude": "",
    "decimalLongitude": "",
    "verbatimCoordinates": "",
    "minimumElevationInMeters": "",
    "maximumElevationInMeters": ""
    }
    do_print_profiler = True
    if do_print_profiler:
        profiler = cProfile.Profile()
        profiler.enable()
    
    Wiki = WikipediaLinks('D:/D_Desktop/usda_pdf/test.json')
    info_packet= Wiki.gather_wikipedia_results(test_output)

    if do_print_profiler:
        profiler.disable()
        stats = pstats.Stats(profiler).sort_stats('cumulative')
        stats.print_stats(50)