harheem commited on
Commit
3bfe3dd
·
verified ·
1 Parent(s): e1e9659

Upload project files

Browse files
README.md CHANGED
@@ -1,14 +1,10 @@
1
  ---
2
- title: Hf Transformers Docs I18n Agent
3
- emoji:
4
- colorFrom: green
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.33.1
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
- short_description: Translation agent for Hugging Face Transformers docs
12
  ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: I18n Agent
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: "5.33.0"
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
 
 
agent/handler.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for gradio chat-based translation agent interface."""
2
+
3
+ import os
4
+ import re
5
+ from pathlib import Path
6
+
7
+ import gradio as gr
8
+
9
+ from agent.workflow import (
10
+ report_translation_target_files,
11
+ translate_docs_interactive,
12
+ generate_github_pr,
13
+ )
14
+ from pr_generator.searcher import find_reference_pr_simple_stream
15
+
16
+
17
+ # State management
18
class ChatState:
    """Mutable session state shared by every chat/UI handler in this module.

    Tracks the workflow phase, the translation target, the queue of files
    still to translate, and the GitHub settings used for PR creation.
    """

    def __init__(self):
        # Workflow phase: welcome -> find_files -> translate -> create_github_pr
        self.step = "welcome"
        # Target language code (e.g. "ko") and how many candidate files to fetch.
        self.target_language = "ko"
        self.k_files = 10
        # Files still needing translation; the head of the list is the current one.
        self.files_to_translate = []
        # Latest translation output, keyed so more artifacts can be added later.
        self.current_file_content = {"translated": ""}
        # Result of the last PR creation attempt, if any.
        self.pr_result = None
        # GitHub settings; the reference PR seeds the PR-style analysis.
        self.github_config = {
            "token": "",
            "owner": "",
            "repo_name": "",
            "reference_pr_url": "https://github.com/huggingface/transformers/pull/24968",
        }


# Single module-level session object used by every handler below.
state = ChatState()
36
+
37
+
38
+ def _extract_content_for_display(content: str) -> str:
39
+ """Extract text from document for display."""
40
+ # Remove Copyright header
41
+ to_translate = re.sub(r"<!--.*?-->", "", content, count=1, flags=re.DOTALL)
42
+ to_translate = to_translate.strip()
43
+ ## remove code blocks from text
44
+ to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
45
+ ## remove markdown tables from text
46
+ to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
47
+ ## remove empty lines from text
48
+ to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
49
+
50
+ return to_translate
51
+
52
+
53
def get_welcome_message():
    """Build the greeting shown when the chat first loads."""
    # Single literal so the markdown renders exactly as authored.
    welcome = """**👋 Welcome to 🌐 Hugging Face i18n Translation Agent!**

I'll help you find files that need translation and translate them in a streamlined workflow.

**🔎 Let's start by finding files that need translation.**

Use the **`Quick Controls`** on the right or **ask me `what`, `how`, or `help`** to get started.
"""
    return welcome
63
+
64
+
65
def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
    """Run the file search, record the results in state, and refresh the UI.

    Args:
        lang: Target language code selected in the dropdown.
        k: Number of untranslated files to look for.
        history: Current chatbot history (list of [user, assistant] pairs).

    Returns:
        Updated (history, cleared input, status HTML, tab selector) tuple.
    """
    global state
    state.target_language = lang
    state.k_files = k
    state.step = "find_files"

    status_report, found_rows = report_translation_target_files(lang, k)
    state.files_to_translate = [row[0] for row in found_rows] if found_rows else []
    queue = state.files_to_translate

    response = f"""**✅ File search completed!**

**Status Report:**
{status_report}

**📁 Found first {len(queue)} files to translate:**
"""

    if not queue:
        response += "\nNo files found that need translation."
    else:
        # Preview at most five entries to keep the chat message short.
        response += "".join(
            f"\n{idx}. `{name}`" for idx, name in enumerate(queue[:5], 1)
        )
        if len(queue) > 5:
            response += f"\n... and {len(queue) - 5} more files"
        response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"

    history.append(["Please find files that need translation", response])
    # Jump to the Translate tab only when there is something to translate.
    next_tab = 1 if queue else 0
    return history, "", update_status(), gr.Tabs(selected=next_tab)
100
+
101
+
102
def start_translation_process():
    """Translate the first queued file and build the chat response.

    Reads the module-level ``state``: uses the target language and the head of
    ``state.files_to_translate``, stores the result in
    ``state.current_file_content``, and writes the translated document under
    ``translation_result/`` so PR creation can re-read it later.

    Returns:
        Markdown-formatted status message for the chatbot (error text on failure).
    """
    if not state.files_to_translate:
        return "❌ No files available for translation."

    current_file = state.files_to_translate[0]

    # Call translation function (simplified for demo)
    try:
        status, translated = translate_docs_interactive(
            state.target_language, [[current_file]]
        )

        state.current_file_content = {"translated": translated}
        # Persist the translation so generate_github_pr can re-read it later.
        output_path = (
            Path(__file__).resolve().parent.parent
            / f"translation_result/{current_file}"
        )
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(translated, encoding="utf-8")

        original_file_link = (
            "https://github.com/huggingface/transformers/blob/main/" + current_file
        )
        # BUG FIX: the heading previously ended with a stray "**" that had no
        # opening marker, producing broken markdown in the chat window.
        # Leftover debug prints of the full translation were also removed.
        response = (
            f"**🔄 Translation for: `{current_file}`**\n"
            "**📄 Original Content Link:**\n"
            f"{original_file_link}\n"
            "**🌐 Translated Content:**\n"
            f"\n```\n\n{_extract_content_for_display(translated)}```\n"
            f"{status}\n"
        )

    except Exception as e:
        # Keep the UI responsive on any failure and tell the user to retry.
        response = f"❌ Translation failed: {str(e)}"
        response += "\n**➡️ Please try from the beginning.**"

    return response
145
+
146
+
147
def handle_general_message(message):
    """Answer chat messages that are not part of a specific workflow step."""
    global state
    text = message.lower()

    # Help / capability overview takes precedence over everything else.
    if any(keyword in text for keyword in ["help", "what", "how"]):
        return """**🤖 I'm your Hugging Face i18n Translation Agent!**

I can help you:
1. **🔍 Find files** that need translation
2. **🌐 Translate documents** using AI
3. **📋 Review translations** for quality
4. **🚀 Create GitHub PR** for translation

Currently available actions with quick controls:
- "find files" - Search for files needing translation
- "translate" - Start translation process
- "review" - Review current translation
- "github" - Create GitHub Pull Request
- "restart" - Start over"""

    # Full reset of the session state.
    if "restart" in text:
        state = ChatState()
        return get_welcome_message()

    # Default nudge toward the quick controls.
    return """I understand you want to work on translations!

To get started, please use the controls above to configure your translation settings and find files that need translation.
"""
177
+
178
+
179
# Main handler
def handle_user_message(message, history):
    """Route a chat message to the right workflow action.

    Args:
        message: Raw user input.
        history: Chatbot history (list of [user, assistant] pairs), mutated
            in place.

    Returns:
        Tuple of (updated history, cleared input string).
    """
    global state

    # Ignore empty/whitespace-only input without touching the history.
    if not message.strip():
        return history, ""

    affirmatives = ["yes", "proceed", "start", "translate", "translation"]
    wants_translation = state.step == "find_files" and any(
        word in message.lower() for word in affirmatives
    )

    if wants_translation:
        # User confirmed: move on to translating the queued files.
        if state.files_to_translate:
            state.step = "translate"
            response = start_translation_process()
        else:
            response = (
                "❌ No files available for translation. Please search for files first."
            )
    else:
        # Anything else is handled as small talk / help.
        # (GitHub PR creation is driven by approve_handler, not chat.)
        response = handle_general_message(message)

    history.append([message, response])
    return history, ""
207
+
208
+
209
def update_status():
    """Render the HTML status card shown above the quick controls.

    Reads the module-level ``state`` and returns an HTML snippet for the
    ``gr.HTML`` status display.
    """
    # Fixed card before any search has happened (nothing dynamic to show yet).
    if state.step == "welcome":
        return """
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
            <div><strong>🔄 Step:</strong> Welcome</div>
            <div><strong>📁 Files:</strong> 0</div>
            <div><strong>🌍 Language:</strong> ko</div>
            <div><strong>⏳ Progress:</strong> Ready</div>
        </div>
        """

    # Human-readable label for each workflow step.
    step_map = {
        "welcome": "Welcome",
        "find_files": "Finding Files",
        "translate": "Translating",
        "review": "Reviewing",
        "create_github_pr": "Creating PR",
    }

    # Short progress description for each workflow step.
    progress_map = {
        "welcome": "Ready to start",
        "find_files": "Files found",
        "translate": f"{len(state.files_to_translate)} remaining",
        "review": "Review complete",
        "create_github_pr": "PR generation in progress",
    }

    # Check GitHub configuration status
    # (configured only when token, owner and repo name are all non-empty).
    github_status = "❌ Not configured"
    if all(
        [
            state.github_config["token"],
            state.github_config["owner"],
            state.github_config["repo_name"],
        ]
    ):
        github_status = (
            f"✅ {state.github_config['owner']}/{state.github_config['repo_name']}"
        )

    # Unknown steps fall back to the raw step name / a generic progress label.
    status_html = f"""
    <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
        <div><strong>🔄 Step:</strong> {step_map.get(state.step, state.step)}</div>
        <div><strong>📁 Files:</strong> {len(state.files_to_translate)}</div>
        <div><strong>🌍 Language:</strong> {state.target_language}</div>
        <div><strong>⏳ Progress:</strong> {progress_map.get(state.step, 'In progress')}</div>
        <div><strong>🔧 GitHub:</strong> {github_status}</div>
    </div>
    """

    return status_html
260
+
261
+
262
+ # Event handlers
263
+
264
+
265
def sync_language_displays(lang):
    """Mirror the tab-1 language choice into tab 2's read-only dropdown."""
    return lang
267
+
268
+
269
def update_github_config(token, owner, repo, reference_pr_url):
    """Store the GitHub settings entered in the UI into the session state.

    An empty reference PR URL keeps the previously configured one. The token
    is also exported via the GITHUB_TOKEN environment variable.
    """
    global state

    # Export the token so downstream GitHub tooling can pick it up.
    if token:
        os.environ["GITHUB_TOKEN"] = token

    config = state.github_config
    config["token"] = token
    config["owner"] = owner
    config["repo_name"] = repo
    # Preserve the existing reference PR when the field was left blank.
    config["reference_pr_url"] = reference_pr_url or config["reference_pr_url"]

    return f"✅ GitHub configuration updated: {owner}/{repo}"
289
+
290
+
291
def send_message(message, history):
    """Chat submit handler: process the message, then refresh the status card."""
    updated_history, cleared = handle_user_message(message, history)
    return updated_history, cleared, update_status()
294
+
295
+
296
# Button handlers with tab switching
def start_translate_handler(history, anthropic_key):
    """Kick off translation after storing the Anthropic API key in the env."""
    os.environ["ANTHROPIC_API_KEY"] = anthropic_key
    updated_history, cleared = handle_user_message("start translation", history)
    # Advance to the PR tab only when a translation was actually produced.
    pr_tab = 2 if state.current_file_content["translated"] else 0
    return updated_history, cleared, update_status(), gr.Tabs(selected=pr_tab)
302
+
303
+
304
def approve_handler(history, owner, repo, reference_pr_url):
    """Handles the request to generate a GitHub PR.

    Args:
        history: Chatbot history (list of [user, assistant] pairs).
        owner: GitHub owner/organization from the UI.
        repo: Repository name from the UI.
        reference_pr_url: Optional PR used as a style reference; when empty,
            an agent searches for a suitable one.

    Returns:
        Updated (history, cleared input, status HTML) tuple.
    """
    global state
    state.step = "create_github_pr"

    # Update github config from the latest UI values
    state.github_config["owner"] = owner
    state.github_config["repo_name"] = repo
    state.github_config["reference_pr_url"] = reference_pr_url

    # Validate GitHub configuration
    github_config = state.github_config
    if not all([github_config.get("token"), owner, repo]):
        response = "❌ GitHub configuration incomplete. Please provide GitHub Token, Owner, and Repository Name in Tab 3."
        history.append(["GitHub PR creation request", response])
        return history, "", update_status()

    # BUG FIX: ``response`` was previously assigned only inside the
    # "search for a reference PR" branch, so supplying a reference PR URL up
    # front crashed with UnboundLocalError at the ``response +=`` below.
    response = ""

    # If reference PR is not provided, use the agent to find one
    if not github_config.get("reference_pr_url"):
        response = "🤖 **Reference PR URL not found. The agent will now search for a suitable one...**"
        try:
            # This part is simplified to avoid streaming logic in a non-generator function
            stream_gen = find_reference_pr_simple_stream(
                target_language=state.target_language,
                context="documentation translation",
            )
            # Drain the generator; its return value arrives via StopIteration.value.
            final_result = None
            try:
                while True:
                    # We are not interested in the streamed messages here, just the final result.
                    next(stream_gen)
            except StopIteration as e:
                final_result = e.value

            if final_result and final_result.get("status") == "success":
                result_text = final_result.get("result", "")
                match = re.search(r"https://github.com/[^\s]+", result_text)
                if match:
                    found_url = match.group(0)
                    state.github_config["reference_pr_url"] = found_url
                    response += f"\n✅ **Agent found a reference PR:** {found_url}"
                else:
                    raise ValueError(
                        "Could not extract a valid PR URL from agent's response."
                    )
            else:
                # BUG FIX: guard against final_result being None before .get().
                error_message = "Unknown error"
                if final_result:
                    error_message = final_result.get("message") or final_result.get(
                        "result", "Unknown error"
                    )
                raise ValueError(f"Agent failed to find a PR. Reason: {error_message}")
        except Exception as e:
            response += f"\n❌ **Agent failed to find a reference PR.**\nReason: {e}\n\nPlease provide a reference PR URL manually in Tab 3 and try again."
            history.append(["Agent searching for PR", response])
            return history, "", update_status()

    # Proceed with PR generation
    if state.files_to_translate and state.current_file_content.get("translated"):
        current_file = state.files_to_translate[0]
        translated_content = state.current_file_content["translated"]
        response += "\n\n🚀 **Generating GitHub PR...**"

        pr_response = generate_github_pr(
            target_language=state.target_language,
            filepath=current_file,
            translated_content=translated_content,
            github_config=state.github_config,
        )
        response += f"\n{pr_response}"
    else:
        response = "❌ No translated file available. Please complete the translation process first."

    history.append(["GitHub PR creation request", response])
    return history, "", update_status()
378
+
379
+
380
def restart_handler(history):
    """Drop all session state and show the welcome screen again."""
    global state
    state = ChatState()
    # Replace the chat with a single fresh welcome message and go back to tab 1.
    fresh_history = [[None, get_welcome_message()]]
    return fresh_history, "", update_status(), gr.Tabs(selected=0)
agent/workflow.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for gradio interfaces."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+ import gradio as gr
6
+
7
+ from translator.content import (
8
+ fill_scaffold,
9
+ get_content,
10
+ get_full_prompt,
11
+ llm_translate,
12
+ preprocess_content,
13
+ )
14
+ from translator.retriever import report
15
+
16
# GitHub PR Agent import
# Optional dependency: PR creation is disabled (with a visible warning at
# import time) when pr_generator or its requirements are not installed.
# generate_github_pr() checks GITHUB_PR_AVAILABLE before doing any work.
try:
    from pr_generator.agent import GitHubPRAgent

    GITHUB_PR_AVAILABLE = True
except ImportError as e:
    print(f"⚠️ GitHub PR Agent is not available: {e}")
    GITHUB_PR_AVAILABLE = False
24
+
25
+ # GitHub configuration - must be provided by user or environment variables
26
+
27
+
28
def report_translation_target_files(
    translate_lang: str, top_k: int = 1
) -> tuple[str, list[list[str]]]:
    """Return a status report plus the top-k files that need translation.

    Args:
        translate_lang: Target language to translate
        top_k: Number of top-first files to return for translation. (Default 1)

    Returns:
        The textual status report and the file paths wrapped one-per-row,
        matching the Gradio dataframe format.
    """
    status_report, filepath_list = report(translate_lang, top_k)
    rows = [[path] for path in filepath_list]
    return status_report, rows
39
+
40
+
41
# Human-readable names for supported language codes, used in the LLM prompt.
_LANGUAGE_NAMES = {"ko": "Korean"}


def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
    """Translate one documentation file.

    Args:
        lang: Target language code (e.g., "ko").
        file_path: Path of the source document to translate.

    Returns:
        Tuple of (LLM callback/cost report, translated document text).

    Raises:
        ValueError: If ``lang`` is not a supported language code.
    """
    # step 1. Get content from file path
    content = get_content(file_path)
    to_translate = preprocess_content(content)

    # step 2. Prepare prompt with docs content
    # BUG FIX: ``translation_lang`` was previously assigned only when
    # lang == "ko", so any other language raised NameError further down.
    # Fail fast with a clear error for unsupported codes instead.
    translation_lang = _LANGUAGE_NAMES.get(lang)
    if translation_lang is None:
        raise ValueError(f"Unsupported target language: {lang!r}")
    to_translate_with_prompt = get_full_prompt(translation_lang, to_translate)

    # step 3. Translate with LLM
    # TODO: hand this part off to the MCP client
    callback_result, translated_content = llm_translate(to_translate_with_prompt)

    # step 4. Add scaffold to translation result
    translated_doc = fill_scaffold(content, to_translate, translated_content)

    return callback_result, translated_doc
60
+
61
+
62
def translate_docs_interactive(
    translate_lang: str, selected_files: list[list[str]]
) -> tuple[str, str]:
    """Interactive translation that processes the first selected file.

    Args:
        translate_lang: Target language to translate
        selected_files: Dataframe-style rows, each ``[file_path]``

    Returns:
        Tuple of (status message, translated document text). The translated
        text is empty when no files were selected.
    """
    # Extract file paths from the dataframe format
    file_paths = [row[0] for row in selected_files if row and len(row) > 0]
    # BUG FIX: this early return used to yield a 6-tuple of gr.update objects
    # while callers unpack exactly two values (crashing on empty selections),
    # and the annotation claimed a 3-tuple. Return a (status, text) pair.
    if not file_paths:
        return "No files selected for translation.", ""

    # Only the first file is processed in the current beta workflow.
    current_file = file_paths[0]

    status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
    callback_result, translated_content = translate_docs(translate_lang, current_file)
    status += f"💰 Used token and cost: \n```\n{callback_result}\n```"

    if len(file_paths) > 1:
        status += f"\n### 📝 Note: Currently, only the first file has been translated.\n> The remaining {len(file_paths) - 1} files have not been processed yet, as the system is in its beta version"

    return status, translated_content
94
+
95
+
96
def generate_github_pr(
    target_language: str,
    filepath: str,
    translated_content: str = None,
    github_config: dict = None,
) -> str:
    """Generate a GitHub PR for translated documentation.

    Args:
        target_language: Target language for translation (e.g., "ko")
        filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
        translated_content: Translated content (if None, read from file)
        github_config: GitHub configuration dictionary; requires "token",
            "owner", "repo_name" and "reference_pr_url" keys, with an
            optional "base_branch" (defaults to "main")

    Returns:
        PR creation result message (always a user-facing string; errors are
        reported in the message rather than raised)
    """
    # Bail out early when the optional pr_generator dependency is missing.
    if not GITHUB_PR_AVAILABLE:
        return "❌ GitHub PR Agent is not available. Please install required libraries."

    if not github_config:
        return "❌ GitHub configuration not provided."

    # Validate required configuration
    required_fields = ["token", "owner", "repo_name", "reference_pr_url"]
    missing_fields = [
        field for field in required_fields if not github_config.get(field)
    ]

    if missing_fields:
        return f"❌ Missing required configuration: {', '.join(missing_fields)}. Please provide these values."

    # Set token in environment for the agent.
    os.environ["GITHUB_TOKEN"] = github_config["token"]

    try:
        # Read translated content from file if not provided
        # (the translation step writes results under translation_result/).
        if translated_content is None:
            translation_file_path = (
                Path(__file__).resolve().parent.parent
                / f"translation_result/{filepath}"
            )
            if not translation_file_path.exists():
                return f"❌ Translation file not found: {translation_file_path}"

            with open(translation_file_path, "r", encoding="utf-8") as f:
                translated_content = f.read()

        if not translated_content or not translated_content.strip():
            return "❌ Translated content is empty."

        # Execute GitHub PR Agent
        print(f"🚀 Starting GitHub PR creation...")
        print(f"   📁 File: {filepath}")
        print(f"   🌍 Language: {target_language}")
        print(f"   📊 Reference PR: {github_config['reference_pr_url']}")
        print(
            f"   🏠 Repository: {github_config['owner']}/{github_config['repo_name']}"
        )

        agent = GitHubPRAgent()
        result = agent.run_translation_pr_workflow(
            reference_pr_url=github_config["reference_pr_url"],
            target_language=target_language,
            filepath=filepath,
            translated_doc=translated_content,
            owner=github_config["owner"],
            repo_name=github_config["repo_name"],
            base_branch=github_config.get("base_branch", "main"),
        )

        # Process result
        # NOTE(review): result is assumed to be a dict with "status",
        # "message" and, on (partial) success, "pr_url"/"branch"/"file_path"
        # keys — confirm against GitHubPRAgent.run_translation_pr_workflow.
        if result["status"] == "success":
            return f"""✅ **GitHub PR Creation Successful!**

🔗 **PR URL:** {result["pr_url"]}
🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}

{result["message"]}"""

        elif result["status"] == "partial_success":
            return f"""⚠️ **Partial Success**

🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}

{result["message"]}

**Error Details:**
{result.get("error_details", "Unknown error")}"""

        else:
            return f"""❌ **GitHub PR Creation Failed**

**Error Message:**
{result["message"]}"""

    except Exception as e:
        # Catch-all boundary: surface the error to the UI instead of raising.
        error_msg = f"❌ Unexpected error occurred during PR creation: {str(e)}"
        print(error_msg)
        return error_msg
198
+
199
+
200
# Backward compatibility function (replaces old mock function)
def mock_generate_PR():
    """Deprecated stub kept for backward compatibility; returns a warning only."""
    warning = (
        "⚠️ mock_generate_PR() is deprecated. Please use generate_github_pr() instead."
    )
    return warning
app.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for gradio chat-based translation agent interface."""
2
+
3
+ import base64
4
+ import os
5
+
6
+ import gradio as gr
7
+ from dotenv import load_dotenv
8
+
9
+ from agent.handler import (
10
+ approve_handler,
11
+ get_welcome_message,
12
+ process_file_search_handler,
13
+ restart_handler,
14
+ send_message,
15
+ start_translate_handler,
16
+ sync_language_displays,
17
+ update_status,
18
+ update_github_config,
19
+ )
20
+ from translator.model import Languages
21
+
22
+ load_dotenv()
23
+
24
+
25
+ css = """
26
+ .gradio-container {
27
+ background: linear-gradient(135deg, #ffeda7 0%, #ffbebf 100%);
28
+ }
29
+ .chat-container {
30
+ background: rgba(255, 255, 180, 0.25);
31
+ border-radius: 18px;
32
+ box-shadow: 0 4px 24px rgba(0,0,0,0.08);
33
+ padding: 1.5em;
34
+ backdrop-filter: blur(8px);
35
+ border: 1px solid rgba(255,255,180,0.25);
36
+ width: 100%;
37
+ height: 100%;
38
+ }
39
+ .control-panel {
40
+ background: rgba(255, 255, 180, 0.25);
41
+ border-radius: 18px;
42
+ box-shadow: 0 4px 24px rgba(0,0,0,0.08);
43
+ padding: 1.5em;
44
+ backdrop-filter: blur(8px);
45
+ border: 1px solid rgba(255,255,180,0.25);
46
+ width: 100%;
47
+ }
48
+ .status-card {
49
+ width: 100%
50
+ }
51
+ .action-button {
52
+ background: linear-gradient(135deg, #ff8c8c 0%, #f9a889 100%) !important;
53
+ color: white !important;
54
+ border: none !important;
55
+ font-weight: 600 !important;
56
+ box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
57
+ transition: all 0.3s ease-in-out !important;
58
+ }
59
+ .action-button:hover {
60
+ background: linear-gradient(135deg, #f9a889 0%, #ff8c8c 100%) !important;
61
+ box-shadow: 0 6px 16px rgba(0,0,0,0.2) !important;
62
+ transform: translateY(-2px) !important;
63
+ }
64
+
65
+ .simple-tabs .tab-nav button {
66
+ background: transparent !important;
67
+ color: #4A5568 !important;
68
+ box-shadow: none !important;
69
+ transform: none !important;
70
+ border: none !important;
71
+ border-bottom: 2px solid #E2E8F0 !important;
72
+ border-radius: 0 !important;
73
+ font-weight: 600 !important;
74
+ }
75
+
76
+ .simple-tabs .tab-nav button.selected {
77
+ color: #f97316 !important;
78
+ border-bottom: 2px solid #f97316 !important;
79
+ }
80
+
81
+ .simple-tabs .tab-nav button:hover {
82
+ background: #f3f4f6 !important;
83
+ color: #f97316 !important;
84
+ box-shadow: none !important;
85
+ transform: none !important;
86
+ }
87
+ """
88
+
89
+
90
+ # Create the main interface
91
+ with gr.Blocks(
92
+ css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
93
+ ) as demo:
94
+
95
+ # Title
96
+ with open("images/hfkr_logo.png", "rb") as img_file:
97
+ base64_img = base64.b64encode(img_file.read()).decode()
98
+ gr.Markdown(
99
+ f'<img src="data:image/png;base64,{base64_img}" style="display: block; margin-left: auto; margin-right: auto; height: 15em;"/>'
100
+ )
101
+ gr.Markdown(
102
+ '<h1 style="text-align: center;"> 🌐 Hugging Face Transformers Docs i18n made easy</h1>'
103
+ )
104
+
105
+ # Content
106
+ with gr.Row():
107
+ # Chat interface
108
+ with gr.Column(scale=4, elem_classes=["chat-container"]):
109
+ gr.Markdown("### 🌐 Hugging Face i18n Agent")
110
+
111
+ chatbot = gr.Chatbot(
112
+ value=[[None, get_welcome_message()]], scale=1, height=585
113
+ )
114
+
115
+ # Controller interface
116
+ with gr.Column(scale=2):
117
+ # Quick Controller
118
+ with gr.Column(elem_classes=["control-panel"]):
119
+ gr.Markdown("### 🛠️ Quick Controls")
120
+ status_display = gr.HTML(update_status())
121
+
122
+ with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
123
+ with gr.TabItem("1. Find Files", id=0):
124
+ with gr.Group():
125
+ lang_dropdown = gr.Dropdown(
126
+ choices=[language.value for language in Languages],
127
+ label="🌍 Translate To",
128
+ value="ko",
129
+ )
130
+ k_input = gr.Number(
131
+ label="📊 First k missing translated docs",
132
+ value=1,
133
+ minimum=1,
134
+ maximum=100,
135
+ )
136
+ find_btn = gr.Button(
137
+ "🔍 Find Files to Translate",
138
+ elem_classes="action-button",
139
+ )
140
+
141
+ with gr.TabItem("2. Translate", id=1):
142
+ with gr.Group():
143
+ translate_lang_display = gr.Dropdown(
144
+ choices=[language.value for language in Languages],
145
+ label="🌍 Translation Language",
146
+ value="ko",
147
+ interactive=False,
148
+ )
149
+ anthropic_key = gr.Textbox(
150
+ label="🔑 Anthropic API key for translation generation",
151
+ type="password",
152
+ )
153
+ start_translate_btn = gr.Button(
154
+ "🚀 Start Translation", elem_classes="action-button"
155
+ )
156
+
157
+ with gr.TabItem("3. Upload PR", id=2):
158
+ with gr.Group():
159
+ github_token = gr.Textbox(
160
+ label="🔑 GitHub Token",
161
+ type="password",
162
+ placeholder="ghp_xxxxxxxxxxxxxxxxxxxx",
163
+ )
164
+ github_owner = gr.Textbox(
165
+ label="👤 GitHub Owner/Username",
166
+ placeholder="your-username",
167
+ )
168
+ github_repo = gr.Textbox(
169
+ label="📁 Repository Name",
170
+ placeholder="your-repository",
171
+ )
172
+ reference_pr_url = gr.Textbox(
173
+ label="🔗 Reference PR URL (Optional - Agent will find one if not provided)",
174
+ placeholder="reference PR URL",
175
+ )
176
+
177
+ save_config_btn = gr.Button(
178
+ "💾 Save GitHub Config", elem_classes="action-button"
179
+ )
180
+ approve_btn = gr.Button(
181
+ "✅ Generate GitHub PR", elem_classes="action-button"
182
+ )
183
+ restart_btn = gr.Button(
184
+ "🔄 Restart Translation", elem_classes="action-button"
185
+ )
186
+
187
+ # Chat Controller
188
+ with gr.Column(elem_classes=["control-panel"]):
189
+ gr.Markdown("### 💬 Chat with agent")
190
+ msg_input = gr.Textbox(
191
+ placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
192
+ container=False,
193
+ scale=4,
194
+ )
195
+ send_btn = gr.Button("Send", scale=1, elem_classes="action-button")
196
+
197
+ # Event Handlers
198
+
199
+ find_btn.click(
200
+ fn=process_file_search_handler,
201
+ inputs=[lang_dropdown, k_input, chatbot],
202
+ outputs=[chatbot, msg_input, status_display, control_tabs],
203
+ )
204
+
205
+ # Sync language across tabs
206
+ lang_dropdown.change(
207
+ fn=sync_language_displays,
208
+ inputs=[lang_dropdown],
209
+ outputs=[translate_lang_display],
210
+ )
211
+
212
+ # Button event handlers
213
+ start_translate_btn.click(
214
+ fn=start_translate_handler,
215
+ inputs=[chatbot, anthropic_key],
216
+ outputs=[chatbot, msg_input, status_display, control_tabs],
217
+ )
218
+
219
+ # GitHub Config Save
220
+ save_config_btn.click(
221
+ fn=update_github_config,
222
+ inputs=[github_token, github_owner, github_repo, reference_pr_url],
223
+ outputs=[msg_input],
224
+ )
225
+
226
+ approve_btn.click(
227
+ fn=approve_handler,
228
+ inputs=[chatbot, github_owner, github_repo, reference_pr_url],
229
+ outputs=[chatbot, msg_input, status_display],
230
+ )
231
+
232
+ restart_btn.click(
233
+ fn=restart_handler,
234
+ inputs=[chatbot],
235
+ outputs=[chatbot, msg_input, status_display, control_tabs],
236
+ )
237
+
238
+ send_btn.click(
239
+ fn=send_message,
240
+ inputs=[msg_input, chatbot],
241
+ outputs=[chatbot, msg_input, status_display],
242
+ )
243
+
244
+ msg_input.submit(
245
+ fn=send_message,
246
+ inputs=[msg_input, chatbot],
247
+ outputs=[chatbot, msg_input, status_display],
248
+ )
249
+
250
+ root_path = os.environ.get("GRADIO_ROOT_PATH")
251
+ demo.launch(root_path=root_path)
example.env ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ANTHROPIC_API_KEY=<your api key>
2
+
3
+ # GitHub PR Agent Configuration
4
+ GITHUB_TOKEN=<your github token>
5
+ GITHUB_OWNER=<your github username>
6
+ GITHUB_REPO=<your repository name>
7
+ REFERENCE_PR_URL=<reference pr url for style analysis>
images/hfkr_logo.png ADDED
index.html ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width" />
6
+ <title>My static Space</title>
7
+ <link rel="stylesheet" href="style.css" />
8
+ </head>
9
+ <body>
10
+ <div class="card">
11
+ <h1>Welcome to your static Space!</h1>
12
+ <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
+ <p>
14
+ Also don't forget to check the
15
+ <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
+ </p>
17
+ </div>
18
+ </body>
19
+ </html>
pr_generator/agent.py ADDED
@@ -0,0 +1,593 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub PR creation agent using Langchain.
3
+ This code integrates with the actual GitHub API using the PyGithub library.
4
+ Please set the GITHUB_TOKEN environment variable and install required libraries before running.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import json
10
+ from typing import Optional, Dict, List, Tuple, Any
11
+
12
+ # Load environment variables from .env file
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv()
16
+
17
+ # Constants definition
18
+ ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
19
+ DEFAULT_TEMPERATURE = 0.0
20
+
21
# Library imports and error handling
# REQUIRED_LIBS_AVAILABLE gates every GitHub/LLM code path below so the module
# can still be imported (e.g. for docs or tests) without the optional deps.
try:
    from github import Github, GithubException
    from github.GitRef import GitRef
    from langchain_anthropic import ChatAnthropic

    REQUIRED_LIBS_AVAILABLE = True
except ImportError as e:
    print(f"Required libraries are not installed: {e}")
    # Fix: the previous hint told users to also install boto3, which this
    # module never imports; list only the packages actually required here.
    print("Please run: pip install PyGithub langchain-anthropic")
    REQUIRED_LIBS_AVAILABLE = False
32
+
33
+
34
class GitHubPRAgent:
    """Agent class for GitHub PR creation"""

    def __init__(self):
        # Both clients are created lazily (see the `github_client` and `llm`
        # properties) so importing/constructing this class never touches the
        # network or requires credentials.
        self._github_client = None  # cached PyGithub client
        self._llm = None  # cached ChatAnthropic client
40
+
41
+ @property
42
+ def github_client(self) -> Optional[Github]:
43
+ """Return GitHub API client with lazy initialization."""
44
+ if not REQUIRED_LIBS_AVAILABLE:
45
+ raise ImportError("Required libraries not found.")
46
+
47
+ if self._github_client is None:
48
+ token = os.environ.get("GITHUB_TOKEN")
49
+ if not token:
50
+ print("Warning: GITHUB_TOKEN environment variable not set.")
51
+ return Github() # Limited access
52
+ self._github_client = Github(token)
53
+
54
+ return self._github_client
55
+
56
+ @property
57
+ def llm(self):
58
+ """Return LLM client with lazy initialization."""
59
+ if not REQUIRED_LIBS_AVAILABLE:
60
+ raise ImportError("Required libraries not found.")
61
+
62
+ if self._llm is None:
63
+ self._llm = ChatAnthropic(
64
+ model=ANTHROPIC_MODEL_ID,
65
+ temperature=DEFAULT_TEMPERATURE,
66
+ )
67
+ return self._llm
68
+
69
+ def _handle_github_error(self, e: Exception, operation: str) -> str:
70
+ """Handle GitHub API errors consistently."""
71
+ if isinstance(e, GithubException):
72
+ return f"{operation} failed: {e.status} {e.data.get('message', e.data)}"
73
+ return f"Unexpected error during {operation}: {str(e)}"
74
+
75
    def create_pull_request(
        self,
        owner: str,
        repo_name: str,
        title: str,
        head: str,
        base: str,
        body: str = "",
        draft: bool = False,
        maintainer_can_modify: bool = True,
    ) -> str:
        """Create a new Pull Request.

        Runs several pre-flight checks (identical branches, duplicate open PR,
        missing branches, no-diff branches) before calling the GitHub API.
        Returns a human-readable status string: either
        "PR creation successful: <url>" or an "ERROR: ..." message — it never
        raises to the caller.
        """
        try:
            # 1. Check if head and base are the same
            if head == base:
                return f"ERROR: head branch ({head}) and base branch ({base}) are identical."

            # 2. Check for existing PR
            existing_pr = self.check_existing_pr(owner, repo_name, head, base)
            if existing_pr:
                return f"ERROR: {existing_pr}"

            # 3. Verify head branch exists
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            try:
                head_branch = repo.get_branch(head)
                base_branch = repo.get_branch(base)

                # 4. Check if head and base branches point to the same commit
                if head_branch.commit.sha == base_branch.commit.sha:
                    return f"ERROR: head branch ({head}) and base branch ({base}) point to the same commit. No changes to merge."

            except GithubException as e:
                # NOTE(review): only 404 is reported here; any other
                # GithubException falls through and PR creation is still
                # attempted — presumably intentional best-effort, confirm.
                if e.status == 404:
                    return f"ERROR: Branch not found. head: {head}, base: {base}"

            # 5. Create PR
            pr = repo.create_pull(
                title=title,
                body=body,
                head=head,
                base=base,
                draft=draft,
                maintainer_can_modify=maintainer_can_modify,
            )
            return f"PR creation successful: {pr.html_url}"
        except GithubException as e:
            # 422 = validation failure; surface GitHub's per-field messages.
            if e.status == 422:
                error_msg = e.data.get("message", "Unknown error")
                errors = e.data.get("errors", [])

                error_details = []
                for error in errors:
                    if "message" in error:
                        error_details.append(error["message"])

                detail_msg = " | ".join(error_details) if error_details else ""
                return f"ERROR: PR creation failed (422): {error_msg}. {detail_msg}"
            return self._handle_github_error(e, "PR creation")
        except Exception as e:
            return self._handle_github_error(e, "PR creation")
136
+
137
+ def create_branch(
138
+ self, owner: str, repo_name: str, branch_name: str, source_sha: str
139
+ ) -> str:
140
+ """Create a new branch."""
141
+ try:
142
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
143
+ ref_name = f"refs/heads/{branch_name}"
144
+ new_ref = repo.create_git_ref(ref=ref_name, sha=source_sha)
145
+
146
+ if isinstance(new_ref, GitRef):
147
+ return f"SUCCESS: Branch '{branch_name}' created successfully (ref: {new_ref.ref})"
148
+ return f"ERROR: Branch '{branch_name}' creation failed. Please check API response."
149
+ except GithubException as e:
150
+ if e.status == 422 and "Reference already exists" in str(e.data):
151
+ return f"WARNING: Branch '{branch_name}' already exists."
152
+ return self._handle_github_error(e, "branch creation")
153
+ except Exception as e:
154
+ return self._handle_github_error(e, "branch creation")
155
+
156
+ def check_existing_pr(
157
+ self, owner: str, repo_name: str, head: str, base: str
158
+ ) -> Optional[str]:
159
+ """Check if there's an existing PR with the same head and base."""
160
+ try:
161
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
162
+ pulls = repo.get_pulls(state="open", head=f"{owner}:{head}", base=base)
163
+ for pr in pulls:
164
+ return f"Existing PR found: {pr.html_url}"
165
+ return None
166
+ except Exception as e:
167
+ print(f"⚠️ Error checking existing PR: {str(e)}")
168
+ return None
169
+
170
+ def create_or_update_file(
171
+ self,
172
+ owner: str,
173
+ repo_name: str,
174
+ path: str,
175
+ message: str,
176
+ content: str,
177
+ branch_name: Optional[str] = None,
178
+ sha_blob: Optional[str] = None,
179
+ ) -> str:
180
+ """Create or update a single file."""
181
+ try:
182
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
183
+
184
+ args = {
185
+ "path": path,
186
+ "message": message,
187
+ "content": content,
188
+ }
189
+ if branch_name:
190
+ args["branch"] = branch_name
191
+
192
+ # Try to update file
193
+ if sha_blob:
194
+ args["sha"] = sha_blob
195
+ repo.update_file(**args)
196
+ return f"SUCCESS: File updated - {path}"
197
+
198
+ # Try to create file
199
+ repo.create_file(**args)
200
+ return f"SUCCESS: File created - {path}"
201
+
202
+ except GithubException as e:
203
+ # Try to update if file already exists
204
+ if e.status == 422:
205
+ try:
206
+ existing_file = repo.get_contents(
207
+ path, ref=branch_name or repo.default_branch
208
+ )
209
+ args["sha"] = existing_file.sha
210
+ repo.update_file(**args)
211
+ return f"SUCCESS: File updated - {path}"
212
+ except:
213
+ pass
214
+ return f"ERROR: File processing failed - {path}"
215
+ except Exception:
216
+ return f"ERROR: File processing failed - {path}"
217
+
218
+ def analyze_reference_pr(self, pr_url: str) -> Dict[str, Any]:
219
+ """Analyze reference PR to extract style information."""
220
+ try:
221
+ # Parse PR URL
222
+ match = re.match(r"https://github\.com/([^/]+)/([^/]+)/pull/(\d+)", pr_url)
223
+ if not match:
224
+ return {"error": f"Invalid PR URL format: {pr_url}"}
225
+
226
+ owner, repo_name, pr_number = match.groups()
227
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
228
+ pr = repo.get_pull(int(pr_number))
229
+
230
+ return {
231
+ "title": pr.title,
232
+ "body": pr.body,
233
+ "head_branch": pr.head.ref,
234
+ "base_branch": pr.base.ref,
235
+ "files_changed": [f.filename for f in pr.get_files()],
236
+ "commits": [
237
+ {"message": c.commit.message, "sha": c.sha}
238
+ for c in pr.get_commits()
239
+ ],
240
+ }
241
+ except Exception as e:
242
+ return {"error": f"Error occurred during PR analysis: {str(e)}"}
243
+
244
+ def _generate_with_llm(
245
+ self, prompt: str, fallback_value: str, operation: str
246
+ ) -> str:
247
+ """Generate text using LLM."""
248
+ try:
249
+ response = self.llm.invoke(prompt)
250
+ generated = response.content.strip()
251
+ print(f"LLM generated {operation}: {generated}")
252
+ return generated
253
+ except Exception as e:
254
+ print(f"❌ Error generating {operation} with LLM: {e}")
255
+ print(f"Using fallback value: {fallback_value}")
256
+ return fallback_value
257
+
258
    def generate_branch_name_from_reference(
        self, reference_branch_name: str, target_language: str, file_name: str
    ) -> str:
        """Use LLM to analyze reference PR information and generate appropriate branch name.

        Falls back to "translate-<lang>-<file>" if the LLM call fails
        (see `_generate_with_llm`).
        """
        prompt = f"""Here is the reference PR information:

Reference PR branch name: {reference_branch_name}

Now I need to generate a branch name for a new translation task:
- Target language: {target_language}
- File to translate: {file_name}

Please analyze the pattern and style of the reference PR title to generate a consistent new branch name.

Requirements:
1. Follow the naming conventions and patterns of the reference PR
2. Appropriately reflect the target language ({target_language}) and file name ({file_name}) if applicable

Please return only the branch name. No other explanation is needed."""

        # Deterministic fallback mirrors common i18n branch naming.
        fallback = f"translate-{target_language}-{file_name.replace('_', '-')}"
        return self._generate_with_llm(prompt, fallback, "branch name")
280
+
281
    def generate_pr_content_from_reference(
        self,
        reference_title: str,
        reference_body: str,
        target_language: str,
        filepath: str,
        target_filepath: str,
        file_name: str,
    ) -> Tuple[str, str]:
        """Use LLM to analyze reference PR title and body and generate appropriate PR content.

        Returns a ``(title, body)`` tuple. The LLM is asked for a
        "Title: ... / Body: ..." response which is parsed line by line below;
        missing parts fall back to the reference title / a default body, and
        any exception falls back to `_generate_default_pr_content`.
        """
        prompt = f"""Here is the reference PR information:

Reference PR title: {reference_title}

Reference PR body:
{reference_body}

Now I need to generate PR title and body for a new translation task:
- Target language: {target_language}
- Original file: {filepath}
- Translation file: {target_filepath}
- File name: {file_name}

Please analyze the style and format of the reference PR to generate consistent new PR title and body.

Requirements:
1. Follow the title format and pattern of the reference PR
2. Maintain the body style, markdown format, indentation, and line breaks of the reference PR
3. Appropriately reflect the target language ({target_language}) and file paths
4. If there are user mentions (@username), change them to general text instead of actual mentions
5. Adjust the content to fit the translation task

Response format:
Title: [PR title here]
Body: [PR body here, maintaining the exact markdown format and structure of the original]"""

        try:
            response = self.llm.invoke(prompt)
            generated_content = response.content.strip()

            # Separate title and body from response: everything after the
            # "Body:" marker (including subsequent lines) belongs to the body.
            lines = generated_content.split("\n")
            title_line = ""
            body_lines = []
            parsing_body = False

            for line in lines:
                if line.startswith("Title:"):
                    title_line = line.replace("Title:", "").strip()
                elif line.startswith("Body:"):
                    parsing_body = True
                    body_content = line.replace("Body:", "").strip()
                    if body_content:
                        body_lines.append(body_content)
                elif parsing_body:
                    body_lines.append(line)

            generated_title = title_line if title_line else reference_title
            generated_body = (
                "\n".join(body_lines)
                if body_lines
                else f"Add {target_language} translation for `{filepath}`."
            )

            print(f"LLM generated PR title: {generated_title}")
            print(f"LLM generated PR body (first 100 chars): {generated_body[:100]}...")

            return generated_title, generated_body

        except Exception as e:
            print(f"❌ Error generating PR content with LLM: {e}")
            return self._generate_default_pr_content(
                target_language, filepath, target_filepath, file_name
            )
355
+
356
    def _generate_default_pr_content(
        self, target_language: str, filepath: str, target_filepath: str, file_name: str
    ) -> Tuple[str, str]:
        """Generate default PR content.

        Deterministic fallback used when the LLM-based generation fails;
        returns a ``(title, markdown_body)`` tuple.
        """
        title = f"[i18n-{target_language}] Add {target_language} translation for {file_name}"
        # Triple-quoted body intentionally starts at column 0 so the emitted
        # markdown has no leading indentation.
        body = f"""## Summary
Add {target_language} translation for `{filepath}`.

## Changes
- Add {target_language} translation: `{target_filepath}`
- Original file: `{filepath}`
"""
        return title, body
369
+
370
    def generate_commit_message_from_reference(
        self, commit_messages: List[str], target_language: str, file_name: str
    ) -> str:
        """Use LLM to analyze reference PR commit messages and generate appropriate commit message.

        Falls back to a conventional "docs: add ..." message when the LLM
        call fails (see `_generate_with_llm`).
        """
        # Bullet-list the reference messages (or "None" when there are none).
        commits_text = (
            "\n".join([f"- {msg}" for msg in commit_messages])
            if commit_messages
            else "None"
        )

        prompt = f"""Here are the commit messages from the reference PR:

{commits_text}

Now I need to generate a commit message for a new translation task:
- Target language: {target_language}
- File to translate: {file_name}

Please analyze the commit message patterns and style of the reference PR to generate a consistent new commit message.

Requirements:
1. Follow the commit message style and format of the reference PR
2. Appropriately reflect the target language ({target_language}) and file name ({file_name})
3. Follow general Git commit message conventions
4. Be concise and clear
5. If you detect typos in the given commit messages, use corrected versions (e.g., dos -> docs)

Please return only the commit message. No other explanation is needed."""

        fallback = f"docs: add {target_language} translation for {file_name}"
        return self._generate_with_llm(prompt, fallback, "commit message")
401
+
402
    def get_branch_info(self, owner: str, repo_name: str, branch_name: str) -> str:
        """Get information about an existing branch.

        Returns a formatted, human-readable summary string; on failure returns
        an error string instead of raising.
        """
        try:
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            branch = repo.get_branch(branch_name)
            commit = branch.commit
            commit_info = commit.commit

            # chr(10) is "\n" — backslashes are not allowed inside f-string
            # expressions before Python 3.12, hence this workaround to take
            # the first line of the commit message.
            return f"""
📋 Existing branch information:
- Branch name: {branch_name}
- Latest commit: {commit.sha[:8]}
- Commit message: {commit_info.message.split(chr(10))[0][:80]}...
- Author: {commit_info.author.name}
- Date: {commit_info.author.date.strftime('%Y-%m-%d %H:%M:%S')}
"""
        except Exception as e:
            return f"Failed to retrieve branch information: {str(e)}"
420
+
421
    def run_translation_pr_workflow(
        self,
        reference_pr_url: str,
        target_language: str,
        filepath: str,
        translated_doc: str,
        owner: str,
        repo_name: str,
        base_branch: str = "main",
    ) -> Dict[str, Any]:
        """Execute translation document PR creation workflow.

        Steps: analyze the reference PR -> derive target path/branch name ->
        create the branch -> commit the translated file -> open the PR.
        Returns a result dict with "status" in {"success", "partial_success",
        "error"} plus context keys ("branch", "file_path", "pr_url", ...).
        Never raises: all failures are reported via the returned dict.
        """
        try:
            # 1. Analyze reference PR
            print(f"🔍 Analyzing reference PR: {reference_pr_url}")
            pr_analysis = self.analyze_reference_pr(reference_pr_url)

            if "error" in pr_analysis:
                return {"status": "error", "message": pr_analysis["error"]}

            print("Reference PR analysis completed")

            # 2. Generate translation file path and branch name
            # Assumes the source doc lives under an "/en/" path segment —
            # e.g. docs/source/en/foo.md -> docs/source/<lang>/foo.md.
            target_filepath = filepath.replace("/en/", f"/{target_language}/")
            file_name = filepath.split("/")[-1].replace(".md", "")

            print(f"🌿 Generating branch name...")
            branch_name = self.generate_branch_name_from_reference(
                pr_analysis["head_branch"], target_language, file_name
            )

            # 3. Get main branch SHA and create branch
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            main_branch = repo.get_branch(base_branch)
            main_sha = main_branch.commit.sha

            print(f"🌿 Creating branch: {branch_name}")
            branch_result = self.create_branch(owner, repo_name, branch_name, main_sha)

            # Check branch creation result
            if branch_result.startswith("ERROR"):
                return {
                    "status": "error",
                    "message": f"Branch creation failed: {branch_result}",
                    "branch": branch_name,
                }
            elif branch_result.startswith("WARNING"):
                print(f"⚠️ {branch_result}")
                # Continue if branch already exists
            else:
                print(f"{branch_result}")

            # 4. Generate commit message and save file
            commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
            commit_message = self.generate_commit_message_from_reference(
                commit_messages, target_language, file_name
            )

            print(f"📄 Saving file: {target_filepath}")
            file_result = self.create_or_update_file(
                owner,
                repo_name,
                target_filepath,
                commit_message,
                translated_doc,
                branch_name,
            )

            if not file_result.startswith("SUCCESS"):
                return {
                    "status": "error",
                    "message": "An issue occurred while saving the file.",
                    "branch": branch_name,
                    "file_path": target_filepath,
                }

            print(f"{file_result}")

            # 5. Create PR
            pr_title, pr_body = self.generate_pr_content_from_reference(
                pr_analysis["title"],
                pr_analysis["body"],
                target_language,
                filepath,
                target_filepath,
                file_name,
            )

            print(f"🔄 Creating PR: {pr_title}")
            print(f" Head: {branch_name} → Base: {base_branch}")

            pr_result = self.create_pull_request(
                owner, repo_name, pr_title, branch_name, base_branch, pr_body
            )

            # "partial_success" below means: the translated file was committed
            # to the branch, but the PR itself could not be opened.
            if pr_result.startswith("ERROR"):
                print(f"❌ {pr_result}")
                return {
                    "status": "partial_success",
                    "branch": branch_name,
                    "file_path": target_filepath,
                    "message": f"File was saved but PR creation failed: {pr_result}",
                    "error_details": pr_result,
                }
            elif "successful" in pr_result and "http" in pr_result:
                print(f"{pr_result}")
                return {
                    "status": "success",
                    "branch": branch_name,
                    "file_path": target_filepath,
                    # Status string has the form "PR creation successful: <url>".
                    "pr_url": pr_result.split(": ")[-1],
                    "message": "Translation document PR created successfully!",
                }
            else:
                return {
                    "status": "partial_success",
                    "branch": branch_name,
                    "file_path": target_filepath,
                    "message": "File was saved but PR creation failed.",
                }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Error occurred during workflow execution: {str(e)}",
            }
546
+
547
+
548
# Backward compatibility functions (maintain compatibility with existing code)
# These thin module-level wrappers delegate to a shared singleton agent so
# older call sites that imported plain functions keep working.
_agent = GitHubPRAgent()


def get_github_client():
    """Return the singleton agent's (lazily created) GitHub client."""
    return _agent.github_client


def create_pull_request_func(*args, **kwargs):
    """Delegate to GitHubPRAgent.create_pull_request."""
    return _agent.create_pull_request(*args, **kwargs)


def create_branch_func(*args, **kwargs):
    """Delegate to GitHubPRAgent.create_branch."""
    return _agent.create_branch(*args, **kwargs)


def create_or_update_file_func(*args, **kwargs):
    """Delegate to GitHubPRAgent.create_or_update_file."""
    return _agent.create_or_update_file(*args, **kwargs)


def analyze_reference_pr_func(*args, **kwargs):
    """Delegate to GitHubPRAgent.analyze_reference_pr."""
    return _agent.analyze_reference_pr(*args, **kwargs)


def generate_branch_name_from_reference(*args, **kwargs):
    """Delegate to GitHubPRAgent.generate_branch_name_from_reference."""
    return _agent.generate_branch_name_from_reference(*args, **kwargs)


def generate_pr_content_from_reference(*args, **kwargs):
    """Delegate to GitHubPRAgent.generate_pr_content_from_reference."""
    return _agent.generate_pr_content_from_reference(*args, **kwargs)


def generate_default_pr_content(*args, **kwargs):
    """Delegate to GitHubPRAgent._generate_default_pr_content."""
    return _agent._generate_default_pr_content(*args, **kwargs)


def generate_commit_message_from_reference(*args, **kwargs):
    """Delegate to GitHubPRAgent.generate_commit_message_from_reference."""
    return _agent.generate_commit_message_from_reference(*args, **kwargs)


def get_branch_info(*args, **kwargs):
    """Delegate to GitHubPRAgent.get_branch_info."""
    return _agent.get_branch_info(*args, **kwargs)


def run_translation_pr_agent_simple(*args, **kwargs):
    """Delegate to GitHubPRAgent.run_translation_pr_workflow."""
    return _agent.run_translation_pr_workflow(*args, **kwargs)
pr_generator/searcher.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub PR Search Agent
3
+ An agent that finds a suitable reference PR when a reference PR URL is not provided.
4
+ """
5
+
6
+ import os
7
+ import re
8
+ import logging
9
+ from typing import List, Dict, Any, Optional
10
+
11
+ # Load environment variables
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
+
16
+ # Setup logging
17
+ logging.basicConfig(
18
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
19
+ )
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Langchain imports
23
+ try:
24
+ from langchain_anthropic import ChatAnthropic
25
+ from langchain.tools import StructuredTool
26
+ from langchain.agents import AgentExecutor, create_tool_calling_agent
27
+ from langchain_core.prompts import ChatPromptTemplate
28
+ from github import Github
29
+
30
+ REQUIRED_LIBS_AVAILABLE = True
31
+ except ImportError as e:
32
+ print(f"Required libraries are not installed: {e}")
33
+ REQUIRED_LIBS_AVAILABLE = False
34
+
35
+ # Constants
36
+ ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
37
+ DEFAULT_TEMPERATURE = 0.0
38
+ # Fallback PR URL to ensure a PR is always returned
39
+ DEFAULT_FALLBACK_PR_URL = "https://github.com/huggingface/transformers/pull/24968"
40
+
41
+
42
+ class GitHubPRSearcher:
43
+ """GitHub PR Searcher - now using a LangChain agent."""
44
+
45
    def _search_github_prs(self, query: str) -> List[Dict[str, Any]]:
        """
        Searches GitHub for pull requests matching the query and returns the top 5 results.
        The query should be a valid GitHub search query.

        Returns a list of {"title", "url", "number"} dicts; on failure returns
        a one-element list containing an {"error": ...} dict so the LLM agent
        consuming the tool output can see what went wrong.
        """
        logger.info(f"Executing GitHub search with query: {query}")
        try:
            issues = self.github_client.search_issues(query=query)
            # Take top 5 to keep context small for the agent
            top_issues = issues.get_page(0)[:5]

            if not top_issues:
                return []

            return [
                {"title": issue.title, "url": issue.html_url, "number": issue.number}
                for issue in top_issues
            ]
        except Exception as e:
            logger.error(f"Error during GitHub search: {e}", exc_info=True)
            # Return an error message that the agent can understand
            return [{"error": f"An error occurred during search: {e}"}]
67
+
68
    def __init__(self):
        """Build the LLM, the search tool and the tool-calling agent executor.

        Raises:
            ImportError: when langchain/PyGithub are not installed.
        """
        if not REQUIRED_LIBS_AVAILABLE:
            raise ImportError("Required libraries for agent could not be found.")

        self._github_client = None  # lazily created in the github_client property
        self.llm = ChatAnthropic(
            model=ANTHROPIC_MODEL_ID,
            temperature=DEFAULT_TEMPERATURE,
        )

        # Expose the PR search as a structured tool the agent can call.
        search_tool = StructuredTool.from_function(
            func=self._search_github_prs,
            name="search_github_prs",
            description="Searches GitHub for pull requests matching the query and returns the top 5 results. The query should be a valid GitHub search query.",
        )
        tools = [search_tool]

        # Placeholders ({owner}, {repo_name}, ...) are filled from the
        # agent input dict at invocation time.
        prompt_string = """You are a GitHub expert. Your mission is to find the best reference pull request (PR) for a given task.

You need to find a merged PR in the repository: {owner}/{repo_name}.
The PR should be for a documentation translation into **{target_language}**.
The context for the translation is: **{context}**.

Use the tools at your disposal to search for relevant PRs.
Analyze the search results and select the one that best matches the request. A good PR is usually one that has "translation", "docs", "i18n", and the target language in its title.

Here is an example of a good search query you could use:
`repo:{owner}/{repo_name} is:pr is:merged "{target_language}" "{context}" i18n translation docs`

After your analysis, you MUST output **only the final URL** of the best PR you have chosen. Do not include any other text in your final response."""

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", prompt_string),
                (
                    "human",
                    "Find the best reference PR for translating docs to {target_language} about {context} in the {owner}/{repo_name} repository.",
                ),
                ("placeholder", "{agent_scratchpad}"),
            ]
        )

        agent = create_tool_calling_agent(self.llm, tools, prompt)
        self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
112
+
113
+ @property
114
+ def github_client(self) -> Optional[Github]:
115
+ """Lazy initialization of the GitHub API client."""
116
+ if not REQUIRED_LIBS_AVAILABLE:
117
+ raise ImportError("Required libraries could not be found.")
118
+
119
+ if self._github_client is None:
120
+ token = os.environ.get("GITHUB_TOKEN")
121
+ if not token:
122
+ print("Warning: GITHUB_TOKEN environment variable is not set.")
123
+ self._github_client = Github() # Limited access
124
+ else:
125
+ self._github_client = Github(token)
126
+ return self._github_client
127
+
128
+ def find_best_reference_pr(
129
+ self, owner: str, repo_name: str, target_language: str, context: str
130
+ ):
131
+ """
132
+ Finds the best reference PR using a LangChain agent.
133
+ Yields progress and returns the final PR URL.
134
+ """
135
+ message = "🤖 Agent is searching for the best reference PR..."
136
+ logger.info(message)
137
+ yield message
138
+
139
+ try:
140
+ agent_input = {
141
+ "owner": owner,
142
+ "repo_name": repo_name,
143
+ "target_language": target_language,
144
+ "context": context,
145
+ }
146
+
147
+ agent_output = None
148
+ for event in self.agent_executor.stream(agent_input):
149
+ if "actions" in event and event["actions"]:
150
+ action = event["actions"][0]
151
+ tool_query = action.tool_input.get("query", str(action.tool_input))
152
+ message = f"🔍 Agent is using tool `{action.tool}` with query:\n`{tool_query}`"
153
+ logger.info(message)
154
+ yield message
155
+ elif "steps" in event and event["steps"]:
156
+ message = "📊 Agent is analyzing the results from the tool..."
157
+ logger.info(message)
158
+ yield message
159
+ elif "output" in event and event["output"]:
160
+ agent_output = event["output"]
161
+
162
+ if not agent_output:
163
+ message = "⚠️ Agent failed to find a suitable PR. Using default PR."
164
+ logger.warning(message)
165
+ yield message
166
+ return DEFAULT_FALLBACK_PR_URL
167
+
168
+ # The agent's final output can be a string, a list of tool results,
169
+ # or a list of content blocks from the LLM. We'll find the URL
170
+ # by searching for it in the string representation of the output.
171
+ output_text = str(agent_output)
172
+ urls = re.findall(r"https?://github.com/[^/]+/[^/]+/pull/\d+", output_text)
173
+
174
+ final_url = ""
175
+ if urls:
176
+ final_url = urls[-1] # Take the last URL found
177
+
178
+ if not final_url:
179
+ message = f"⚠️ Agent returned unparsable output: {agent_output}. Using default PR."
180
+ logger.warning(message)
181
+ yield message
182
+ return DEFAULT_FALLBACK_PR_URL
183
+
184
+ message = f"✅ Selected the best PR:\n`{final_url}`"
185
+ logger.info(f"Selected the best PR: {final_url}")
186
+ yield message
187
+ return final_url
188
+
189
+ except Exception as e:
190
+ message = f"❌ Error during agent execution: {e}\nUsing default PR."
191
+ logger.error(message, exc_info=True)
192
+ yield message
193
+ return DEFAULT_FALLBACK_PR_URL
194
+
195
+
196
def find_reference_pr_simple_stream(target_language: str = "", context: str = ""):
    """
    A simple function to find a reference PR, streaming progress.
    This function always searches in the 'huggingface/transformers' repository.

    Generator: yields the searcher's progress messages and returns a summary
    dict (status/result/repository/target_language) as the generator's
    return value (StopIteration.value).
    """
    searcher = GitHubPRSearcher()
    stream_generator = searcher.find_best_reference_pr(
        "huggingface", "transformers", target_language, context
    )
    # The handler will receive the final URL from the generator's return statement
    final_url = yield from stream_generator

    # Format the final result as expected by the handler
    return {
        "status": "success",
        "result": f"Recommended PR URL: {final_url}",
        "repository": "huggingface/transformers",
        "target_language": target_language,
    }
215
+
216
+
217
# Example usage
if __name__ == "__main__":
    # Example execution for streaming
    # In a real application, a generator consumer (like the one in handler.py)
    # would process the yielded values. This script simulates that.
    print("--- Running Streaming Search Simulation ---")

    def run_simulation():
        """Simulates the consumption of the streaming generator."""
        test_gen = find_reference_pr_simple_stream(
            target_language="korean", context="docs"
        )
        try:
            while True:
                # This will print progress messages
                print(next(test_gen))
        except StopIteration as e:
            # When the generator is exhausted, the final result is in e.value
            # (a generator's `return` value travels via StopIteration).
            print("\n--- FINAL RESULT ---")
            print(e.value)

    run_simulation()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.33.0
2
+ requests
3
+ pydantic
4
+ langchain-anthropic
5
+ python-dotenv
6
+ langchain
7
+ PyGithub
8
+ langchain-core
9
+ langchain-community
style.css ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ padding: 2rem;
3
+ font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
+ }
5
+
6
+ h1 {
7
+ font-size: 16px;
8
+ margin-top: 0;
9
+ }
10
+
11
+ p {
12
+ color: rgb(107, 114, 128);
13
+ font-size: 15px;
14
+ margin-bottom: 10px;
15
+ margin-top: 5px;
16
+ }
17
+
18
+ .card {
19
+ max-width: 620px;
20
+ margin: 0 auto;
21
+ padding: 16px;
22
+ border: 1px solid lightgray;
23
+ border-radius: 16px;
24
+ }
25
+
26
+ .card p:last-child {
27
+ margin-bottom: 0;
28
+ }
test/__init__.py ADDED
File without changes
test/test_final_translate.md ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # 가속기 선택 [[accelerator-selection]]
18
+
19
+ 분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
20
+
21
+ 이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
22
+
23
+ ## 가속기 개수 [[number-of-accelerators]]
24
+
25
+ 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
26
+
27
+ <hfoptions id="select-accelerator">
28
+ <hfoption id="torchrun">
29
+
30
+ 사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
31
+
32
+ ```bash
33
+ torchrun --nproc_per_node=2 trainer-program.py ...
34
+ ```
35
+
36
+ </hfoption>
37
+ <hfoption id="Accelerate">
38
+
39
+ 사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
40
+
41
+ ```bash
42
+ accelerate launch --num_processes 2 trainer-program.py ...
43
+ ```
44
+
45
+ </hfoption>
46
+ <hfoption id="🤗 DeepSpeed">
47
+
48
+ 사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
49
+
50
+ ```bash
51
+ deepspeed --num_gpus 2 trainer-program.py ...
52
+ ```
53
+
54
+ </hfoption>
55
+ </hfoptions>
56
+
57
+ ## 가속기 순서 [[order-of-accelerators]]
58
+ 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
59
+
60
+ 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
61
+
62
+ <hfoptions id="accelerator-type">
63
+ <hfoption id="CUDA">
64
+
65
+ ```bash
66
+ CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
67
+ ```
68
+
69
+ GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
70
+ 순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
71
+
72
+
73
+ ```bash
74
+ CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
75
+ ```
76
+
77
+ GPU 없이 실행하려면:
78
+
79
+ ```bash
80
+ CUDA_VISIBLE_DEVICES= python trainer-program.py ...
81
+ ```
82
+
83
+ `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
84
+
85
+ - PCIe 버스 ID 순서(`nvidia-smi`와 일치):
86
+
87
+ ```bash
88
+ export CUDA_DEVICE_ORDER=PCI_BUS_ID
89
+ ```
90
+
91
+ - 컴퓨팅 성능 순서(가장 빠른 것부터):
92
+
93
+ ```bash
94
+ export CUDA_DEVICE_ORDER=FASTEST_FIRST
95
+ ```
96
+
97
+ </hfoption>
98
+ <hfoption id="Intel XPU">
99
+
100
+ ```bash
101
+ ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
102
+ ```
103
+
104
+ XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
105
+ 순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
106
+
107
+ ```bash
108
+ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
109
+ ```
110
+
111
+
112
+ 다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
113
+
114
+ ```bash
115
+ export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
116
+ ```
117
+
118
+ Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
119
+
120
+ </hfoption>
121
+ </hfoptions>
122
+
123
+
124
+
125
+ > [!WARNING]
126
+ > 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
127
+
test/test_prompt.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ output = """
2
+ What do these sentences about Hugging Face Transformers (a machine learning library) mean in Korean? Please do not translate the word after a 🤗 emoji as it is a product name. Output only the translated result without any explanations or introductions.
3
+ ```md
4
+ # Accelerator selection
5
+
6
+ During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
7
+
8
+ This guide will show you how to select the number of accelerators to use and the order to use them in.
9
+
10
+ ## Number of accelerators
11
+
12
+ For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
13
+
14
+ <hfoptions id="select-accelerator">
15
+ <hfoption id="torchrun">
16
+
17
+ Use the `--nproc_per_node` to select how many accelerators to use.
18
+
19
+ </hfoption>
20
+ <hfoption id="Accelerate">
21
+
22
+ Use `--num_processes` to select how many accelerators to use.
23
+
24
+ </hfoption>
25
+ <hfoption id="DeepSpeed">
26
+
27
+ Use `--num_gpus` to select how many GPUs to use.
28
+
29
+ </hfoption>
30
+ </hfoptions>
31
+
32
+ ## Order of accelerators
33
+ To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
34
+
35
+ For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
36
+
37
+ <hfoptions id="accelerator-type">
38
+ <hfoption id="CUDA">
39
+
40
+ Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
41
+ To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
42
+
43
+ To run without any GPUs:
44
+
45
+ You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
46
+
47
+ - Order by PCIe bus ID (matches `nvidia-smi`):
48
+
49
+
50
+
51
+ - Order by compute capability (fastest first):
52
+
53
+
54
+
55
+ </hfoption>
56
+ <hfoption id="Intel XPU">
57
+
58
+ Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
59
+ To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
60
+
61
+ You can also control the order of Intel XPUs with:
62
+
63
+ For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
64
+
65
+ </hfoption>
66
+ </hfoptions>
67
+
68
+ > [!WARNING]
69
+ > Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
70
+ ```
71
+ """
test/test_translate.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ translated_content = """
2
+ # 가속기 선택
3
+
4
+ 분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
5
+
6
+ 이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
7
+
8
+ ## 가속기 개수
9
+
10
+ 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
11
+
12
+ <hfoptions id="select-accelerator">
13
+ <hfoption id="torchrun">
14
+
15
+ 사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
16
+
17
+ </hfoption>
18
+ <hfoption id="Accelerate">
19
+
20
+ 사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
21
+
22
+ </hfoption>
23
+ <hfoption id="🤗 DeepSpeed">
24
+
25
+ 사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
26
+
27
+ </hfoption>
28
+ </hfoptions>
29
+
30
+ ## 가속기 순서
31
+ 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
32
+
33
+ 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
34
+
35
+ <hfoptions id="accelerator-type">
36
+ <hfoption id="CUDA">
37
+
38
+ GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
39
+ 순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
40
+
41
+ GPU 없이 실행하려면:
42
+
43
+ `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
44
+
45
+ - PCIe 버스 ID 순서(`nvidia-smi`와 일치):
46
+
47
+
48
+
49
+ - 컴퓨팅 성능 순서(가장 빠른 것부터):
50
+
51
+
52
+
53
+ </hfoption>
54
+ <hfoption id="Intel XPU">
55
+
56
+ XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
57
+ 순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
58
+
59
+ 다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
60
+
61
+ Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
62
+
63
+ </hfoption>
64
+ </hfoptions>
65
+
66
+ > [!WARNING]
67
+ > 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
68
+ """
translation_result/docs/source/.DS_Store ADDED
Binary file (6.15 kB). View file
 
translation_result/docs/source/en/accelerator_selection.md ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # 가속기 선택 [[accelerator-selection]]
18
+
19
+ 분산 훈련 중에 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 연산 성능을 가진 가속기가 있고 더 빠른 가속기를 먼저 사용하고 싶을 때 유용할 수 있습니다. 또는 사용 가능한 가속기 중 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
20
+
21
+ 이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
22
+
23
+ ## 가속기 수 [[number-of-accelerators]]
24
+
25
+ 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령을 실행하세요.
26
+
27
+ <hfoptions id="select-accelerator">
28
+ <hfoption id="torchrun">
29
+
30
+ `--nproc_per_node`를 사용하여 사용할 가속기 수를 선택하세요.
31
+
32
+ ```bash
33
+ torchrun --nproc_per_node=2 trainer-program.py ...
34
+ ```
35
+
36
+ </hfoption>
37
+ <hfoption id="Accelerate">
38
+
39
+ `--num_processes`를 사용하여 사용할 가속기 수를 선택하세요.
40
+
41
+ ```bash
42
+ accelerate launch --num_processes 2 trainer-program.py ...
43
+ ```
44
+
45
+ </hfoption>
46
+ <hfoption id="DeepSpeed">
47
+
48
+ `--num_gpus`를 사용하여 사용할 GPU 수를 선택하세요.
49
+
50
+ ```bash
51
+ deepspeed --num_gpus 2 trainer-program.py ...
52
+ ```
53
+
54
+ </hfoption>
55
+ </hfoptions>
56
+
57
+ ## 가속기 순서 [[order-of-accelerators]]
58
+ 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 각 실행마다 명령줄에서 설정되는 경우가 많지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
59
+
60
+ 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
61
+
62
+ <hfoptions id="accelerator-type">
63
+ <hfoption id="CUDA">
64
+
65
+ ```bash
66
+ CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
67
+ ```
68
+
69
+ GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
70
+ 순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
71
+
72
+
73
+ ```bash
74
+ CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
75
+ ```
76
+
77
+ GPU 없이 실행하려면:
78
+
79
+ ```bash
80
+ CUDA_VISIBLE_DEVICES= python trainer-program.py ...
81
+ ```
82
+
83
+ `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치 순서를 제어할 수도 있습니다:
84
+
85
+ - PCIe 버스 ID 순서로 정렬 (`nvidia-smi`와 일치):
86
+
87
+ ```bash
88
+ export CUDA_DEVICE_ORDER=PCI_BUS_ID
89
+ ```
90
+
91
+ - 연산 성능 순서로 정렬 (가장 빠른 것부터):
92
+
93
+ ```bash
94
+ export CUDA_DEVICE_ORDER=FASTEST_FIRST
95
+ ```
96
+
97
+ </hfoption>
98
+ <hfoption id="Intel XPU">
99
+
100
+ ```bash
101
+ ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
102
+ ```
103
+
104
+ XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
105
+ 순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
106
+
107
+ ```bash
108
+ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
109
+ ```
110
+
111
+
112
+ 다음으로 Intel XPU 순서를 제어할 수도 있습니다:
113
+
114
+ ```bash
115
+ export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
116
+ ```
117
+
118
+ Intel XPU의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
119
+
120
+ </hfoption>
121
+ </hfoptions>
122
+
123
+
124
+
125
+ > [!WARNING]
126
+ > 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 결국 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신, 동일한 명령줄에서 특정 훈련 실행에 대해 환경 변수를 설정하는 것이 일반적인 관례입니다.
127
translator/__init__.py ADDED
File without changes
translator/content.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+
4
+ import requests
5
+ from langchain.callbacks import get_openai_callback
6
+ from langchain_anthropic import ChatAnthropic
7
+
8
+
9
def get_content(filepath: str) -> str:
    """Fetch a file's raw text from the huggingface/transformers main branch.

    Args:
        filepath: repo-relative path, e.g. ``docs/source/en/accelerator_selection.md``.

    Returns:
        The file's content as text.

    Raises:
        ValueError: if the file could not be retrieved (non-200 response).
        requests.Timeout: if GitHub does not respond within 30 seconds.
    """
    url = string.Template(
        "https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
    ).safe_substitute(filepath=filepath)
    # A timeout prevents the UI from hanging forever on a stalled connection
    # (the original call had none, which is requests' default behavior).
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.text
    raise ValueError("Failed to retrieve content from the URL.", url)
19
+
20
+
21
def preprocess_content(content: str) -> str:
    """Strip non-translatable parts (license header, code, tables) from a doc."""
    # Drop everything before the first heading; the license comment block
    # sits above it and must not be translated.
    text = content[content.find("#"):]
    # Fenced code blocks stay in English.
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)
    # Markdown table rows stay in English.
    text = re.sub(r"^\|.*\|$\n?", "", text, flags=re.MULTILINE)
    # Collapse runs of blank lines left behind by the removals.
    return re.sub(r"\n\n+", "\n\n", text)
34
+
35
+
36
def get_full_prompt(language: str, to_translate: str) -> str:
    """Build the LLM translation prompt, wrapping *to_translate* in an ``md`` fence."""
    header = (
        f"What do these sentences about Hugging Face Transformers "
        f"(a machine learning library) mean in {language}? "
        f"Please do not translate the word after a 🤗 emoji "
        f"as it is a product name. Output only the translated markdown result "
        f"without any explanations or introductions.\n\n```md"
    )
    # The closing fence lets the model know where the document ends.
    return "\n".join((header, to_translate.strip(), "```"))
45
+
46
+
47
def split_markdown_sections(markdown: str) -> list:
    """Split *markdown* on headings into [level, title, content, ...] triples."""
    heading = re.compile(r"^(#+\s+)(.*)$", flags=re.MULTILINE)
    # Drop the pre-heading chunk so the list starts at the first level marker.
    return heading.split(markdown)[1:]
51
+
52
+
53
def get_anchors(divided: list) -> list:
    """Derive ``[[slug]]`` anchors from the titles in a split-sections list.

    Slug rules mirror huggingface/doc-builder
    (src/doc_builder/build_doc.py, build-time anchor generation).
    """
    def slugify(title: str) -> str:
        # Keep only ascii lowercase letters, digits and whitespace, then
        # normalize whitespace runs into single hyphens.
        cleaned = re.sub(r"[^a-z0-9\s]+", "", title.lower())
        return re.sub(r"\s{2,}", " ", cleaned.strip()).replace(" ", "-")

    # Titles occupy positions 1, 4, 7, ... in [level, title, content, ...].
    return [f"[[{slugify(title)}]]" for title in divided[1::3]]
61
+
62
+
63
def make_scaffold(content: str, to_translate: str) -> string.Template:
    """Turn *content* into a Template with each paragraph as a numbered placeholder."""
    scaffold = content
    for index, paragraph in enumerate(to_translate.split("\n\n")):
        placeholder = f"$hf_i18n_placeholder{index}"
        # Replace only the first occurrence so duplicate paragraphs keep
        # their positional numbering.
        scaffold = scaffold.replace(paragraph, placeholder, 1)
    return string.Template(scaffold)
68
+
69
+
70
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
    """Re-assemble the original document with the translated text filled in.

    Builds the placeholder scaffold from *content*, re-attaches the original
    English-derived anchors (``[[slug]]``) to each translated heading, and
    substitutes the translated paragraphs back into the scaffold.

    Returns the completed document, or — as an error signal — the ``str`` of
    a one-item list describing a paragraph-count mismatch between scaffold
    placeholders and the translated text.
    """
    scaffold = make_scaffold(content, to_translate)
    divided = split_markdown_sections(to_translate)
    anchors = get_anchors(divided)

    translated = split_markdown_sections(translated)

    # Titles sit at positions 1, 4, 7, ...; append the English anchor so
    # cross-references keep resolving after translation.
    translated[1::3] = [
        f"{korean_title} {anchors[i]}"
        for i, korean_title in enumerate(translated[1::3])
    ]
    # Re-join the [level, title, content] triples, then re-split into
    # paragraphs so they line up 1:1 with the scaffold's placeholders.
    translated = "".join(
        ["".join(translated[i * 3 : i * 3 + 3]) for i in range(len(translated) // 3)]
    ).split("\n\n")
    # A count mismatch means the LLM inserted or dropped blank lines; report
    # the discrepancy instead of emitting a corrupted document.
    if newlines := scaffold.template.count("$hf_i18n_placeholder") - len(translated):
        return str(
            [
                f"Please {'recover' if newlines > 0 else 'remove'} "
                f"{abs(newlines)} incorrectly inserted double newlines."
            ]
        )

    translated_doc = scaffold.safe_substitute(
        {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated)}
    )
    return translated_doc
96
+
97
+
98
def llm_translate(to_translate: str) -> tuple[str, str]:
    """Send the prepared prompt to Claude and return (usage info, translation).

    NOTE(review): the first returned element is the LangChain usage/cost
    callback object, not a ``str`` as annotated — confirm the intended
    return type with callers.
    """
    with get_openai_callback() as cb:
        # streaming=True — presumably to avoid long-response timeouts; confirm.
        model = ChatAnthropic(
            model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
        )
        ai_message = model.invoke(to_translate)
        print("cb:", cb)  # log token usage/cost for debugging
        return cb, ai_message.content
translator/model.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum, unique
2
+
3
+ from pydantic import BaseModel, computed_field
4
+
5
+
6
@unique
class Languages(Enum):
    """Language codes supported for documentation translations.

    Values mirror the per-language directory names under ``docs/source/``
    in huggingface/transformers (e.g. ``docs/source/ko``) — see
    ``translator.retriever.report``, which joins them into paths.
    """

    az = "az"
    bn = "bn"
    de = "de"
    em = "em"  # NOTE(review): "em" is not a standard language code — confirm.
    es = "es"
    fa = "fa"
    fr = "fr"
    he = "he"
    hu = "hu"
    id = "id"
    it = "it"
    ja = "ja"
    ko = "ko"
    pl = "pl"
    pt = "pt"
    ru = "ru"
    tr = "tr"
    uk = "uk"
    ur = "ur"
    vi = "vi"
    yo = "yo"
    zh = "zh"
    zh_hant = "zh-hant"  # member name uses "_" because "-" is not a valid identifier
31
+
32
+
33
class TranslationDoc(BaseModel):
    """Pairing of an English source doc with its (expected) translation."""

    official_lang: str = "en"  # source-language code
    translation_lang: str  # target-language code
    original_file: str  # repo-relative path of the English doc
    translation_file: str | None = None  # expected repo-relative path of the translation
    translation_exists: bool  # whether translation_file is present in the repo
39
+
40
+
41
class Summary(BaseModel):
    """Aggregated translation-coverage statistics for a single language."""

    lang: str
    files_analyzed: int = 0
    files_translated: int = 0
    files_outdated: int = 0
    files_missing_translation: int = 0
    files: list[TranslationDoc] = []

    @computed_field  # type: ignore
    @property
    def percentage_missing_translation(self) -> float:
        """Share of analyzed files lacking a translation, in percent."""
        try:
            ratio = float(self.files_missing_translation) / float(self.files_analyzed)
        except Exception:
            # No files analyzed yet (division by zero) — report 0%.
            return 0.0
        return 100 * ratio

    def append_file(self, doc: TranslationDoc) -> None:
        """Record one analyzed doc and update the running counters."""
        self.files.append(doc)
        self.files_analyzed += 1
        if doc.translation_exists:
            self.files_translated += 1
        else:
            self.files_missing_translation += 1

    def first_missing_translation_files(self, length: int = 10) -> list[TranslationDoc]:
        """Return up to *length* docs that still lack a translation."""
        missing = [doc for doc in self.files if not doc.translation_exists]
        return missing[:length]
translator/retriever.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import requests
5
+
6
+ from .model import Languages, Summary, TranslationDoc
7
+
8
+ URL = "https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1"
9
+
10
+
11
def get_github_repo_files():
    """List all ``docs/`` file paths in huggingface/transformers.

    Uses the GitHub git-trees API with recursive listing.

    Returns:
        list[str]: repo-relative paths of every blob under ``docs/``.

    Raises:
        requests.HTTPError: if the API call fails. (Previously a failed
            call silently produced an empty list via ``data.get("tree", [])``,
            which made every doc look untranslated.)
        requests.Timeout: if GitHub does not respond within 30 seconds.
    """
    response = requests.get(URL, timeout=30)
    response.raise_for_status()

    data = response.json()
    all_items = data.get("tree", [])

    return [
        item["path"]
        for item in all_items
        if item["type"] == "blob" and item["path"].startswith("docs")
    ]
26
+
27
+
28
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """Render the coverage report and list the first missing translations.

    Returns the markdown report plus up to *table_size* untranslated files.
    """
    report = f"""
    | Item | Count | Percentage |
    |------|-------|------------|
    | 📂 HuggingFaces docs | {summary.files_analyzed} | - |
    | 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
    """
    print(report)
    first_missing_docs = [
        doc.original_file
        for doc in summary.first_missing_translation_files(table_size)
    ]
    print(first_missing_docs)
    return report, first_missing_docs
46
+
47
+
48
def report(target_lang: str, top_k: int = 1) -> tuple[str, list[str]]:
    """Build a translation-coverage report for *target_lang*.

    Scans every English markdown doc in huggingface/transformers, records
    whether the corresponding translated file exists, and returns the
    rendered report plus the first *top_k* files missing a translation.

    Raises:
        KeyError: if *target_lang* is not a member of ``Languages``.
    """
    docs_file = get_github_repo_files()
    # Set gives O(1) membership tests instead of scanning the list per file.
    docs_index = set(docs_file)

    base_docs_path = Path("docs/source")
    en_docs_path = Path("docs/source/en")

    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    for file in docs_file:
        if not file.endswith(".md"):
            continue
        try:
            file_relative_path = Path(file).relative_to(en_docs_path)
        except ValueError:
            # Not under docs/source/en (e.g. already a translation); skip.
            continue

        # GitHub tree paths always use "/" — build with POSIX separators so
        # the membership test also works on Windows (os.path.join would use
        # the native os.sep and never match).
        translated_path = (base_docs_path / lang.value / file_relative_path).as_posix()
        translation_exists = translated_path in docs_index

        doc = TranslationDoc(
            translation_lang=lang.value,
            original_file=file,
            translation_file=translated_path,
            translation_exists=translation_exists,
        )
        summary.append_file(doc)
    return retrieve(summary, top_k)