import gradio as gr
from recite_module import run
from chatbot_module import respond
from doc_bot import Qa
demo = gr.Blocks()


title = "El_Professor"
description = """
**El_Professor: Enhance Text Extraction from Images with Audio Transcription**

**How to Use:**

1. **Record Yourself**: Begin by recording yourself reading aloud the content that corresponds to the text in the image.

2. **Upload Recorded Audio**: After recording, upload the audio file containing your speech. This audio will be used to enhance text extraction from the image.

3. **Upload Image**: Next, upload the image containing the text you want to extract. Ensure the text in the image is visible and clear.

4. **Check Your Progress**: Once both the audio and image are uploaded, the application processes them to enhance text extraction. The output displays the processed image with highlighted text regions, showing how well your spoken words align with the written text.

**Note:** This application is intended to help you read the text in an image aloud accurately. It may not produce perfect results in every case, but it can help you track your progress and refine your recitation over time.
"""
im = "the-king-and-three-sisters-around-the-world-stories-for-children.png"

demo1 = gr.Interface(
    run,
    [
        gr.Audio(sources=["microphone"], type="numpy"),
        gr.Image(type="filepath", label="Image"),
    ],
    gr.Image(type="pil", label="Output Image"),
    title=title,
    description=description,
)
demo2 = gr.Interface(
    run,
    [
        gr.Audio(sources=["upload"], type="numpy"),
        gr.Image(type="filepath", label="Image"),
    ],
    gr.Image(type="pil", label="Output Image"),
    title=title,
    description=description,
    examples=[["Beginning.wav", im], ["Middels.wav", im]],
)
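# Both interfaces above route to the same `run` callback, so it must accept
# the two inputs in order and return a value matching the output component.
# A minimal sketch of the implied contract (the body is hypothetical; the
# real implementation lives in recite_module):
#
#     def run(audio, image_path):
#         sample_rate, samples = audio   # gr.Audio(type="numpy") yields (int, np.ndarray)
#         ...                            # transcribe the speech, OCR the image, align the two
#         return annotated_image         # a PIL.Image, per gr.Image(type="pil")
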
demo3 = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.",
                   label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512,
                  step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7,
                  step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
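# gr.ChatInterface calls `respond` with the user message and the chat history
# first, then each additional input in the order declared above. A sketch of
# the expected signature (parameter names are assumed; the implementation is
# in chatbot_module):
#
#     def respond(message, history, system_message, max_new_tokens, temperature, top_p):
#         ...
#         return reply   # a string, or yield partial strings to stream tokens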

demo4 = gr.Interface(
    fn=Qa,
    inputs=[
        gr.Image(type="filepath", label="Upload Document"),
        gr.Textbox(label="Question")
    ],
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Conversations", type="text")
    ],
    title="Document-based Chatbot",
    examples=[[im, "How many sisters are in the story?"]],
    description="This chatbot allows you to upload a document and ask questions about it. Answers are generated from the content of the uploaded document."
)
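# gr.Interface passes `Qa` one argument per input component and expects one
# return value per output component. A sketch of the implied contract
# (assumed; the implementation lives in doc_bot):
#
#     def Qa(document_path, question):
#         ...
#         return answer, conversation_log   # two strings, one per output Textbox
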
with demo:
    gr.TabbedInterface(
        [demo2, demo4, demo1, demo3],
        ["Audio File", "Document Chatbot", "Microphone", "General Chatbot"],
    )
if __name__ == "__main__":
    demo.launch()