Spaces:
Running
Running
File size: 6,527 Bytes
41ba402 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
# Media_wiki_tab.py
# Description: Gradio UI snippet that allows users to import a MediaWiki XML dump file into the application.
#
# Imports
import os
from threading import Thread
#
# 3rd-party Imports
import gradio as gr
#
# Local Imports
from App_Function_Libraries.MediaWiki.Media_Wiki import import_mediawiki_dump
#
#######################################################################################################################
#
# Create MediaWiki Import Tab
def create_mediawiki_import_tab():
    """Build the "MediaWiki Import" Gradio tab and wire up its event handlers.

    Must be called inside an active ``gr.Blocks`` context. The tab collects a
    MediaWiki XML dump file plus import/chunking options, streams the progress
    messages produced by ``import_mediawiki_dump`` into a Markdown status
    panel, and offers a cancel button while an import is running.

    Returns:
        tuple: ``(file_path, wiki_name, namespaces, skip_redirects, single_item,
        chunk_method, chunk_size, chunk_overlap, import_button, output)`` - the
        Gradio components created for this tab, in that order.
    """
    with gr.Tab("MediaWiki Import"):
        gr.Markdown("# Import MediaWiki Dump")
        with gr.Row():
            with gr.Column():
                file_path = gr.File(label="MediaWiki XML Dump File")
                wiki_name = gr.Textbox(label="Wiki Name", placeholder="Enter a unique name for this wiki")
                namespaces = gr.Textbox(label="Namespaces (comma-separated integers, leave empty for all)")
                skip_redirects = gr.Checkbox(label="Skip Redirects", value=True)
                single_item = gr.Checkbox(label="Import as Single Item", value=False)
                chunk_method = gr.Dropdown(
                    choices=["sentences", "words", "paragraphs", "tokens"],
                    value="sentences",
                    label="Chunking Method"
                )
                chunk_size = gr.Slider(minimum=100, maximum=2000, value=1000, step=100, label="Chunk Size")
                chunk_overlap = gr.Slider(minimum=0, maximum=500, value=100, step=10, label="Chunk Overlap")
                import_button = gr.Button("Import MediaWiki Dump")
                # Hidden until an import is actually running (see run_import).
                cancel_button = gr.Button("Cancel Import", visible=False)
            with gr.Column():
                output = gr.Markdown(label="Import Status")

        def parse_namespaces(namespaces):
            """Parse the namespaces textbox value.

            Returns a list of ints, or None when the field is empty/blank
            (meaning "all namespaces"). Raises ValueError when any
            comma-separated entry is not an integer.
            """
            if not namespaces or not namespaces.strip():
                return None
            return [int(ns.strip()) for ns in namespaces.split(',')]

        def validate_inputs(file_path, wiki_name, namespaces):
            """Return a user-facing error message, or None when the inputs are usable."""
            if not file_path:
                return "Please select a MediaWiki XML dump file."
            if not wiki_name:
                return "Please enter a name for the wiki."
            try:
                parse_namespaces(namespaces)
            except ValueError:
                return "Invalid namespaces. Please enter comma-separated integers."
            return None

        def check_file_size(file_path):
            """Return a warning string when the file at `file_path` exceeds 1000 MB, else None."""
            max_size_mb = 1000  # 1 GB
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
            if file_size_mb > max_size_mb:
                return f"Warning: The selected file is {file_size_mb:.2f} MB. Importing large files may take a long time."
            return None

        # Set by cancel_import and polled by run_import.
        # NOTE(review): this flag lives in the closure created by this call, so it
        # appears to be shared by every session using this tab - confirm that is acceptable.
        cancel_flag = False

        def run_import(file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size,
                       chunk_overlap, progress=gr.Progress()):
            """Generator event handler: run the import and stream status updates.

            Every yielded value is a 3-tuple of updates for
            ``(cancel_button, import_button, output)``.
            """
            nonlocal cancel_flag
            # A stale cancel request from a previous run must not abort this one.
            cancel_flag = False

            validation_error = validate_inputs(file_path, wiki_name, namespaces)
            if validation_error:
                # This function is a generator, so the message has to be yielded;
                # a value passed to `return` would never reach the UI.
                yield gr.update(), gr.update(), validation_error
                return

            # The File component value may be a plain path string or an object
            # exposing the path as `.name`, depending on the Gradio version/type setting.
            dump_path = getattr(file_path, "name", file_path)

            file_size_warning = check_file_size(dump_path)
            status_text = "# MediaWiki Import Process\n\n## Initializing\n- Starting import process...\n"
            if file_size_warning:
                status_text += f"- {file_size_warning}\n"

            chunk_options = {
                'method': chunk_method,
                'max_size': chunk_size,
                'overlap': chunk_overlap,
                'adaptive': True,
                'language': 'en'
            }
            namespaces_list = parse_namespaces(namespaces)

            # Swap the buttons: show "Cancel Import", hide "Import" while running.
            yield gr.update(visible=True), gr.update(visible=False), status_text

            pages_processed = 0
            cancelled = False
            try:
                for progress_info in import_mediawiki_dump(
                        file_path=dump_path,
                        wiki_name=wiki_name,
                        namespaces=namespaces_list,
                        skip_redirects=skip_redirects,
                        chunk_options=chunk_options,
                        single_item=single_item,
                        progress_callback=progress
                ):
                    if progress_info.startswith("Found"):
                        status_text += f"\n## Parsing\n- {progress_info}\n"
                    elif progress_info.startswith("Processed page"):
                        pages_processed += 1
                        if pages_processed % 10 == 0:  # Update every 10 pages to avoid too frequent updates
                            status_text += f"- {progress_info}\n"
                    elif progress_info.startswith("Successfully imported"):
                        status_text += f"\n## Completed\n- {progress_info}\n- Total pages processed: {pages_processed}"
                    else:
                        status_text += f"- {progress_info}\n"
                    yield gr.update(), gr.update(), status_text

                    # Honour a cancel request between progress messages.
                    if cancel_flag:
                        cancelled = True
                        break

                if cancelled:
                    status_text += (f"\n## Cancelled\n- Import cancelled by user after "
                                    f"{pages_processed} pages were processed.")
                else:
                    status_text += "\n## Import Process Completed Successfully"
            except Exception as e:
                # UI boundary: report the failure in the status panel instead of crashing the handler.
                status_text += f"\n## Error\n- An error occurred during the import process: {str(e)}"

            # Restore the idle button state regardless of how the import ended.
            yield gr.update(visible=False), gr.update(visible=True), status_text

        def cancel_import():
            """Request cancellation of the running import and restore the idle button state."""
            nonlocal cancel_flag
            cancel_flag = True
            return gr.update(visible=False), gr.update(visible=True)

        import_button.click(
            run_import,
            inputs=[file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size,
                    chunk_overlap],
            outputs=[cancel_button, import_button, output]
        )

        cancel_button.click(
            cancel_import,
            outputs=[cancel_button, import_button],
            # Bypass the queue so the request is not held up behind the running import.
            queue=False
        )

    return file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size, chunk_overlap, import_button, output
#
# End of MediaWiki Import Tab
#######################################################################################################################
|