import gradio as gr from transformers import pipeline from datasets import load_dataset import requests from bs4 import BeautifulSoup import random classification_model = pipeline("text-classification", model="CesarLeblanc/plantbert_text_classification_model") mask_model = pipeline("fill-mask", model="CesarLeblanc/plantbert_fill_mask_model") dataset = load_dataset("CesarLeblanc/plantbert_text_classification_dataset") def return_text(habitat_label, habitat_score, confidence): if habitat_score*100 > confidence: text = f"This vegetation plot belongs to the habitat {habitat_label} with the probability {habitat_score*100:.2f}%." else: text = f"We can't assign an habitat to this vegetation plot with a confidence of at least {confidence}%." return text def return_habitat_image(habitat_label, habitat_score, confidence): floraveg_url = f"https://floraveg.eu/habitat/overview/{habitat_label}" response = requests.get(floraveg_url) if response.status_code == 200: soup = BeautifulSoup(response.text, 'html.parser') img_tag = soup.find('img', src=lambda x: x and x.startswith("https://files.ibot.cas.cz/cevs/images/syntaxa/thumbs/")) if img_tag: image_url = img_tag['src'] else: image_url = "https://www.salonlfc.com/wp-content/uploads/2018/01/image-not-found-scaled-1150x647.png" else: image_url = "https://www.salonlfc.com/wp-content/uploads/2018/01/image-not-found-scaled-1150x647.png" if habitat_score*100 < confidence: image_url = "https://www.salonlfc.com/wp-content/uploads/2018/01/image-not-found-scaled-1150x647.png" image_url = "https://www.commissionoceanindien.org/wp-content/uploads/2018/07/plantnet.jpg" image = gr.Image(value=image_url) return image def return_species_image(species): species = species[0].capitalize() + species[1:] floraveg_url = f"https://floraveg.eu/taxon/overview/{species}" response = requests.get(floraveg_url) if response.status_code == 200: soup = BeautifulSoup(response.text, 'html.parser') img_tag = soup.find('img', src=lambda x: x and x.startswith("https://files.ibot.cas.cz/cevs/images/taxa/large/")) if img_tag: image_url = img_tag['src'] else: image_url = "https://www.salonlfc.com/wp-content/uploads/2018/01/image-not-found-scaled-1150x647.png" else: image_url = "https://www.salonlfc.com/wp-content/uploads/2018/01/image-not-found-scaled-1150x647.png" image_url = "https://www.commissionoceanindien.org/wp-content/uploads/2018/07/plantnet.jpg" image = gr.Image(value=image_url) return image def gbif_normalization(text): base = "https://api.gbif.org/v1" api = "species" function = "match" parameter = "name" url = f"{base}/{api}/{function}?{parameter}=" all_species = text.split(',') all_species = [species.strip() for species in all_species] species_gbif = [] for species in all_species: url = url.replace(url.partition('name')[2], f'={species}') r = requests.get(url) r = r.json() if 'species' in r: r = r["species"] else: r = species species_gbif.append(r) text = ", ".join(species_gbif) text = text.lower() return text def classification(text, typology, confidence): text = gbif_normalization(text) result = classification_model(text) habitat_label = result[0]['label'] habitat_label = dataset['train'].features['label'].names[int(habitat_label.split('_')[1])] habitat_score = result[0]['score'] formatted_output = return_text(habitat_label, habitat_score, confidence) image_output = return_habitat_image(habitat_label, habitat_score, confidence) return formatted_output, image_output def masking(text): text = gbif_normalization(text) masked_text = text + ', [MASK] [MASK]' pred_genus = mask_model(masked_text, top_k=10)[0] for d in pred_genus: d["score"] += random.uniform(0, 0.1) pred_genus.sort(key=lambda x: x["score"], reverse=True) for i in range(3): new_genus = pred_genus[i]['token_str'] masked_text = text + f', {new_genus} [MASK]' pred_epithet = mask_model(masked_text, top_k=3) for j in range(3): new_epithet = pred_epithet[j]['token_str'] new_species = new_genus + ' ' + new_epithet url_species = f"https://api.gbif.org/v1/species/match?name={new_species}" r = requests.get(url_species) r = r.json() if new_species not in text and r["matchType"] != "NONE": text = f"The last species from this vegetation plot is probably {new_species}." image = return_species_image(new_species) return text, image text = f"We can't find the last species from this vegetation plot." image = return_species_image(new_species) return text, image with gr.Blocks() as demo: gr.Markdown("""