# Snapshot metadata from the hosting page: file size 1,180 bytes;
# commits eea58a2, e68599c.
import requests
import gradio as gr

# Upper bound (seconds) on how long a single dataset request may block the UI.
_REQUEST_TIMEOUT_SECONDS = 30


def extract_text_from_url(url):
    """Fetch a rows-style JSON document and return its 'text' fields.

    The response is expected to be a JSON object with a "rows" list whose
    entries look like ``{"row": {"text": ...}}``. Entries that do not match
    that shape, or whose text is null, are skipped instead of failing the
    whole extraction; non-string text values are converted with ``str``.

    Args:
        url: Address of the JSON endpoint to query. Surrounding whitespace
            is ignored.

    Returns:
        The extracted texts joined with newlines, or a message starting
        with "An error occurred:" when the URL is blank, the request fails,
        or the response is not a JSON object. Errors are reported through
        the return value rather than raised.
    """
    # Reject blank input up front so the user gets a clear message instead
    # of a low-level URL-parsing error from the HTTP layer.
    if not isinstance(url, str) or not url.strip():
        return "An error occurred: no URL was provided"

    try:
        # Without a timeout a stalled server would hang this handler forever.
        response = requests.get(url.strip(), timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Turn HTTP 4xx/5xx into an exception
        data = response.json()

        if not isinstance(data, dict):
            return "An error occurred: response is not a JSON object"

        texts = []
        for entry in data.get("rows") or []:
            record = entry.get("row") if isinstance(entry, dict) else None
            if not isinstance(record, dict):
                continue  # Malformed row: skip it, keep the rest
            text = record.get("text")
            if text is not None:
                # str.join only accepts strings, so coerce anything else.
                texts.append(text if isinstance(text, str) else str(text))

        return "\n".join(texts)
    except Exception as e:
        # This is the UI boundary: show the failure in the output box
        # instead of letting it propagate.
        return f"An error occurred: {e}"

# Build the Gradio UI: a URL input box feeding extract_text_from_url and a
# 20-line output box showing whatever that function returns.
_url_box = gr.Textbox(
    label="Dataset URL",
    placeholder="Enter the dataset URL",
)
_result_box = gr.Textbox(
    label="Extracted Texts",
    lines=20,
    placeholder="Extracted texts will appear here",
)

interface = gr.Interface(
    fn=extract_text_from_url,
    inputs=_url_box,
    outputs=_result_box,
    title="Extract Text from Hugging Face Dataset",
    # Adjacent literals are concatenated; the second one is an example URL.
    description=(
        "Enter the URL of a Hugging Face dataset to extract and display the 'text' fields. "
        "https://datasets-server.huggingface.co/rows?dataset=pszemraj%2Fmidjourney-messages-cleaned&config=deduped&split=train&length=100&offset=0"
    ),
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()