# Media_wiki_tab.py # Description: Gradio UI snippet that allows users to import a MediaWiki XML dump file into the application. # # Imports import os from threading import Thread # # 3rd-party Imports import gradio as gr # # Local Imports from App_Function_Libraries.MediaWiki.Media_Wiki import import_mediawiki_dump # ####################################################################################################################### # # Create MediaWiki Import Tab def create_mediawiki_import_tab(): with gr.Tab("MediaWiki Import"): gr.Markdown("# Import MediaWiki Dump") with gr.Row(): with gr.Column(): file_path = gr.File(label="MediaWiki XML Dump File") wiki_name = gr.Textbox(label="Wiki Name", placeholder="Enter a unique name for this wiki") namespaces = gr.Textbox(label="Namespaces (comma-separated integers, leave empty for all)") skip_redirects = gr.Checkbox(label="Skip Redirects", value=True) single_item = gr.Checkbox(label="Import as Single Item", value=False) chunk_method = gr.Dropdown( choices=["sentences", "words", "paragraphs", "tokens"], value="sentences", label="Chunking Method" ) chunk_size = gr.Slider(minimum=100, maximum=2000, value=1000, step=100, label="Chunk Size") chunk_overlap = gr.Slider(minimum=0, maximum=500, value=100, step=10, label="Chunk Overlap") import_button = gr.Button("Import MediaWiki Dump") cancel_button = gr.Button("Cancel Import", visible=False) with gr.Column(): output = gr.Markdown(label="Import Status") progress_bar = gr.Progress() def validate_inputs(file_path, wiki_name, namespaces): if not file_path: return "Please select a MediaWiki XML dump file." if not wiki_name: return "Please enter a name for the wiki." if namespaces: try: [int(ns.strip()) for ns in namespaces.split(',')] except ValueError: return "Invalid namespaces. Please enter comma-separated integers." return None def check_file_size(file_path): max_size_mb = 1000 # 1 GB file_size_mb = os.path.getsize(file_path) / (1024 * 1024) if file_size_mb > max_size_mb: return f"Warning: The selected file is {file_size_mb:.2f} MB. Importing large files may take a long time." return None import_thread = None cancel_flag = False def run_import(file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size, chunk_overlap, progress=gr.Progress()): validation_error = validate_inputs(file_path, wiki_name, namespaces) if validation_error: return gr.update(), gr.update(), validation_error file_size_warning = check_file_size(file_path.name) status_text = "# MediaWiki Import Process\n\n## Initializing\n- Starting import process...\n" if file_size_warning: status_text += f"- {file_size_warning}\n" chunk_options = { 'method': chunk_method, 'max_size': chunk_size, 'overlap': chunk_overlap, 'adaptive': True, 'language': 'en' } namespaces_list = [int(ns.strip()) for ns in namespaces.split(',')] if namespaces else None pages_processed = 0 try: for progress_info in import_mediawiki_dump( file_path=file_path.name, wiki_name=wiki_name, namespaces=namespaces_list, skip_redirects=skip_redirects, chunk_options=chunk_options, single_item=single_item, progress_callback=progress ): if progress_info.startswith("Found"): status_text += f"\n## Parsing\n- {progress_info}\n" elif progress_info.startswith("Processed page"): pages_processed += 1 if pages_processed % 10 == 0: # Update every 10 pages to avoid too frequent updates status_text += f"- {progress_info}\n" elif progress_info.startswith("Successfully imported"): status_text += f"\n## Completed\n- {progress_info}\n- Total pages processed: {pages_processed}" else: status_text += f"- {progress_info}\n" yield gr.update(), gr.update(), status_text status_text += "\n## Import Process Completed Successfully" except Exception as e: status_text += f"\n## Error\n- An error occurred during the import process: {str(e)}" yield gr.update(visible=False), gr.update(visible=True), status_text def start_import(*args): nonlocal import_thread import_thread = Thread(target=run_import, args=args) import_thread.start() return gr.update(visible=True), gr.update(visible=False), gr.update( value="Import process started. Please wait...") def cancel_import(): nonlocal cancel_flag cancel_flag = True return gr.update(visible=False), gr.update(visible=True) import_button.click( run_import, inputs=[file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size, chunk_overlap], outputs=[cancel_button, import_button, output] ) cancel_button.click( cancel_import, outputs=[cancel_button, import_button] ) return file_path, wiki_name, namespaces, skip_redirects, single_item, chunk_method, chunk_size, chunk_overlap, import_button, output # # End of MediaWiki Import Tab #######################################################################################################################