Upload project files
- README.md +5 -9
- agent/handler.py +386 -0
- agent/workflow.py +205 -0
- app.py +251 -0
- example.env +7 -0
- images/hfkr_logo.png +0 -0
- index.html +19 -0
- pr_generator/agent.py +593 -0
- pr_generator/searcher.py +238 -0
- requirements.txt +9 -0
- style.css +28 -0
- test/__init__.py +0 -0
- test/test_final_translate.md +127 -0
- test/test_prompt.py +71 -0
- test/test_translate.py +68 -0
- translation_result/docs/source/.DS_Store +0 -0
- translation_result/docs/source/en/accelerator_selection.md +127 -0
- translator/__init__.py +0 -0
- translator/content.py +105 -0
- translator/model.py +70 -0
- translator/retriever.py +79 -0
README.md
CHANGED
@@ -1,14 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: I18n Agent
+emoji: 🤖
+colorFrom: blue
+colorTo: green
 sdk: gradio
-sdk_version: 5.33.
+sdk_version: "5.33.0"
 app_file: app.py
 pinned: false
-license: apache-2.0
-short_description: Translation agent for Hugging Face Transformers docs
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
agent/handler.py
ADDED
@@ -0,0 +1,386 @@
"""Module for gradio chat-based translation agent interface."""

import os
import re
from pathlib import Path

import gradio as gr

from agent.workflow import (
    report_translation_target_files,
    translate_docs_interactive,
    generate_github_pr,
)
from pr_generator.searcher import find_reference_pr_simple_stream


# State management
class ChatState:
    def __init__(self):
        self.step = "welcome"  # welcome -> find_files -> translate -> create_github_pr
        self.target_language = "ko"
        self.k_files = 10
        self.files_to_translate = []
        self.current_file_content = {"translated": ""}
        self.pr_result = None  # Store PR creation result
        # GitHub configuration
        self.github_config = {
            "token": "",
            "owner": "",
            "repo_name": "",
            "reference_pr_url": "https://github.com/huggingface/transformers/pull/24968",
        }


state = ChatState()


def _extract_content_for_display(content: str) -> str:
    """Extract text from document for display."""
    # Remove the copyright header (the first HTML comment)
    to_translate = re.sub(r"<!--.*?-->", "", content, count=1, flags=re.DOTALL)
    to_translate = to_translate.strip()
    # Remove code blocks from text
    to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
    # Remove markdown tables from text
    to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
    # Collapse runs of empty lines
    to_translate = re.sub(r"\n\n+", "\n\n", to_translate)

    return to_translate


def get_welcome_message():
    """Initial welcome message with file finding controls."""
    return """**👋 Welcome to the 🌐 Hugging Face i18n Translation Agent!**

I'll help you find files that need translation and translate them in a streamlined workflow.

**🔎 Let's start by finding files that need translation.**

Use the **`Quick Controls`** on the right or **ask me `what`, `how`, or `help`** to get started.
"""


def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
    """Process file search request and update Gradio UI components."""
    global state
    state.target_language = lang
    state.k_files = k
    state.step = "find_files"

    status_report, files_list = report_translation_target_files(lang, k)
    state.files_to_translate = [file[0] for file in files_list] if files_list else []

    response = f"""**✅ File search completed!**

**Status Report:**
{status_report}

**📁 Found the first {len(state.files_to_translate)} files to translate:**
"""

    if state.files_to_translate:
        for i, file in enumerate(state.files_to_translate[:5], 1):  # Show first 5
            response += f"\n{i}. `{file}`"

        if len(state.files_to_translate) > 5:
            response += f"\n... and {len(state.files_to_translate) - 5} more files"

        response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
    else:
        response += "\nNo files found that need translation."

    # Add to history
    history.append(["Please find files that need translation", response])
    cleared_input = ""
    selected_tab = 1 if state.files_to_translate else 0

    return history, cleared_input, update_status(), gr.Tabs(selected=selected_tab)


def start_translation_process():
    """Start the translation process for the first file."""
    if not state.files_to_translate:
        return "❌ No files available for translation."

    current_file = state.files_to_translate[0]

    # Call translation function (simplified for demo)
    try:
        status, translated = translate_docs_interactive(
            state.target_language, [[current_file]]
        )

        state.current_file_content = {"translated": translated}
        path = (
            Path(__file__).resolve().parent.parent
            / f"translation_result/{current_file}"
        )
        p = Path(path)
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_text(translated, encoding="utf-8")

        original_file_link = (
            "https://github.com/huggingface/transformers/blob/main/" + current_file
        )
        response = (
            f"**🔄 Translation for: `{current_file}`**\n"
            "**📄 Original Content Link:**\n"
            f"{original_file_link}\n"
            "**🌐 Translated Content:**\n"
            f"\n```\n\n{_extract_content_for_display(translated)}```\n"
            f"{status}\n"
        )
        # Debug output
        print("translated:")
        print(translated)
        print("extracted")

    except Exception as e:
        response = f"❌ Translation failed: {str(e)}"
        response += "\n**➡️ Please try from the beginning.**"

    return response


def handle_general_message(message):
    """Handle general messages."""
    global state
    message_lower = message.lower()

    if any(word in message_lower for word in ["help", "what", "how"]):
        return """**🤖 I'm your Hugging Face i18n Translation Agent!**

I can help you:
1. **🔍 Find files** that need translation
2. **🌐 Translate documents** using AI
3. **📋 Review translations** for quality
4. **🚀 Create GitHub PR** for translation

Currently available actions with quick controls:
- "find files" - Search for files needing translation
- "translate" - Start translation process
- "review" - Review current translation
- "github" - Create GitHub Pull Request
- "restart" - Start over"""

    elif "restart" in message_lower:
        state = ChatState()
        return get_welcome_message()

    else:
        return """I understand you want to work on translations!

To get started, please use the controls above to configure your translation settings and find files that need translation.
"""


# Main handler
def handle_user_message(message, history):
    """Handle user messages and provide appropriate responses."""
    global state

    if not message.strip():
        return history, ""

    elif state.step == "find_files" and any(
        word in message.lower()
        for word in ["yes", "proceed", "start", "translate", "translation"]
    ):
        # User wants to start translation
        if state.files_to_translate:
            state.step = "translate"
            response = start_translation_process()
        else:
            response = (
                "❌ No files available for translation. Please search for files first."
            )

    # GitHub PR creation is handled by approve_handler, which is the main entry point for PRs
    else:
        # General response
        response = handle_general_message(message)

    history.append([message, response])
    return history, ""


def update_status():
    if state.step == "welcome":
        return """
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
    <div><strong>🔄 Step:</strong> Welcome</div>
    <div><strong>📁 Files:</strong> 0</div>
    <div><strong>🌍 Language:</strong> ko</div>
    <div><strong>⏳ Progress:</strong> Ready</div>
</div>
"""

    step_map = {
        "welcome": "Welcome",
        "find_files": "Finding Files",
        "translate": "Translating",
        "review": "Reviewing",
        "create_github_pr": "Creating PR",
    }

    progress_map = {
        "welcome": "Ready to start",
        "find_files": "Files found",
        "translate": f"{len(state.files_to_translate)} remaining",
        "review": "Review complete",
        "create_github_pr": "PR generation in progress",
    }

    # Check GitHub configuration status
    github_status = "❌ Not configured"
    if all(
        [
            state.github_config["token"],
            state.github_config["owner"],
            state.github_config["repo_name"],
        ]
    ):
        github_status = (
            f"✅ {state.github_config['owner']}/{state.github_config['repo_name']}"
        )

    status_html = f"""
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
    <div><strong>🔄 Step:</strong> {step_map.get(state.step, state.step)}</div>
    <div><strong>📁 Files:</strong> {len(state.files_to_translate)}</div>
    <div><strong>🌍 Language:</strong> {state.target_language}</div>
    <div><strong>⏳ Progress:</strong> {progress_map.get(state.step, 'In progress')}</div>
    <div><strong>🔧 GitHub:</strong> {github_status}</div>
</div>
"""

    return status_html


# Event handlers


def sync_language_displays(lang):
    return lang


def update_github_config(token, owner, repo, reference_pr_url):
    """Update GitHub configuration settings."""
    global state

    # Set GitHub token in environment variables
    if token:
        os.environ["GITHUB_TOKEN"] = token

    # Save GitHub configuration to state
    state.github_config.update(
        {
            "token": token,
            "owner": owner,
            "repo_name": repo,
            "reference_pr_url": reference_pr_url
            or state.github_config["reference_pr_url"],
        }
    )

    return f"✅ GitHub configuration updated: {owner}/{repo}"


def send_message(message, history):
    new_history, cleared_input = handle_user_message(message, history)
    return new_history, cleared_input, update_status()


# Button handlers with tab switching
def start_translate_handler(history, anthropic_key):
    os.environ["ANTHROPIC_API_KEY"] = anthropic_key
    new_hist, cleared_input = handle_user_message("start translation", history)
    selected_tab = 2 if state.current_file_content["translated"] else 0
    return new_hist, cleared_input, update_status(), gr.Tabs(selected=selected_tab)


def approve_handler(history, owner, repo, reference_pr_url):
    """Handles the request to generate a GitHub PR."""
    global state
    state.step = "create_github_pr"
    # Initialized up front so the PR-generation step below can append to it
    # even when a reference PR URL was already provided.
    response = ""

    # Update github config from the latest UI values
    state.github_config["owner"] = owner
    state.github_config["repo_name"] = repo
    state.github_config["reference_pr_url"] = reference_pr_url

    # Validate GitHub configuration
    github_config = state.github_config
    if not all([github_config.get("token"), owner, repo]):
        response = "❌ GitHub configuration incomplete. Please provide GitHub Token, Owner, and Repository Name in Tab 3."
        history.append(["GitHub PR creation request", response])
        return history, "", update_status()

    # If reference PR is not provided, use the agent to find one
    if not github_config.get("reference_pr_url"):
        response = "🤖 **Reference PR URL not found. The agent will now search for a suitable one...**"
        try:
            # Simplified to avoid streaming logic in a non-generator function
            stream_gen = find_reference_pr_simple_stream(
                target_language=state.target_language,
                context="documentation translation",
            )
            # Drain the generator; its return value arrives via StopIteration.value (PEP 380).
            final_result = None
            try:
                while True:
                    # The streamed messages are not needed here, only the final result.
                    next(stream_gen)
            except StopIteration as e:
                final_result = e.value

            if final_result and final_result.get("status") == "success":
                result_text = final_result.get("result", "")
                match = re.search(r"https://github.com/[^\s]+", result_text)
                if match:
                    found_url = match.group(0)
                    state.github_config["reference_pr_url"] = found_url
                    response += f"\n✅ **Agent found a reference PR:** {found_url}"
                else:
                    raise ValueError(
                        "Could not extract a valid PR URL from agent's response."
                    )
            else:
                error_message = final_result.get("message") or final_result.get(
                    "result", "Unknown error"
                )
                raise ValueError(f"Agent failed to find a PR. Reason: {error_message}")
        except Exception as e:
            response += f"\n❌ **Agent failed to find a reference PR.**\nReason: {e}\n\nPlease provide a reference PR URL manually in Tab 3 and try again."
            history.append(["Agent searching for PR", response])
            return history, "", update_status()

    # Proceed with PR generation
    if state.files_to_translate and state.current_file_content.get("translated"):
        current_file = state.files_to_translate[0]
        translated_content = state.current_file_content["translated"]
        response += "\n\n🚀 **Generating GitHub PR...**"

        pr_response = generate_github_pr(
            target_language=state.target_language,
            filepath=current_file,
            translated_content=translated_content,
            github_config=state.github_config,
        )
        response += f"\n{pr_response}"
    else:
        response = "❌ No translated file available. Please complete the translation process first."

    history.append(["GitHub PR creation request", response])
    return history, "", update_status()


def restart_handler(history):
    """Resets the state and UI."""
    global state
    state = ChatState()
    welcome_msg = get_welcome_message()
    new_hist = [[None, welcome_msg]]
    return new_hist, "", update_status(), gr.Tabs(selected=0)
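For a sense of what `_extract_content_for_display` actually strips, here is a minimal sketch; the sample markdown below is invented for illustration:

from agent.handler import _extract_content_for_display

sample = """<!-- Copyright 2024 license header -->

# Attention

Some prose to keep.

```python
print("this code block is dropped")
```

| col_a | col_b |
|-------|-------|
| 1     | 2     |

More prose to keep."""

print(_extract_content_for_display(sample))
# Roughly: "# Attention\n\nSome prose to keep.\n\nMore prose to keep."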
agent/workflow.py
ADDED
@@ -0,0 +1,205 @@
"""Module for gradio interfaces."""

import os
from pathlib import Path
import gradio as gr

from translator.content import (
    fill_scaffold,
    get_content,
    get_full_prompt,
    llm_translate,
    preprocess_content,
)
from translator.retriever import report

# GitHub PR Agent import
try:
    from pr_generator.agent import GitHubPRAgent

    GITHUB_PR_AVAILABLE = True
except ImportError as e:
    print(f"⚠️ GitHub PR Agent is not available: {e}")
    GITHUB_PR_AVAILABLE = False

# GitHub configuration must be provided by the user or via environment variables


def report_translation_target_files(
    translate_lang: str, top_k: int = 1
) -> tuple[str, list[list[str]]]:
    """Return the top-k files that need translation.

    Args:
        translate_lang: Target language to translate
        top_k: Number of top-first files to return for translation (default 1)
    """
    status_report, filepath_list = report(translate_lang, top_k)
    return status_report, [[file] for file in filepath_list]


def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
    """Translate documentation."""
    # step 1. Get content from file path
    content = get_content(file_path)
    to_translate = preprocess_content(content)

    # step 2. Prepare prompt with docs content
    if lang == "ko":
        translation_lang = "Korean"
    else:
        # Only Korean is wired up so far; fail explicitly instead of with a NameError below.
        raise ValueError(f"Unsupported target language: {lang}")
    to_translate_with_prompt = get_full_prompt(translation_lang, to_translate)

    # step 3. Translate with LLM
    # TODO: hand this part off to the MCP client
    callback_result, translated_content = llm_translate(to_translate_with_prompt)

    # step 4. Add scaffold to translation result
    translated_doc = fill_scaffold(content, to_translate, translated_content)

    return callback_result, translated_doc


def translate_docs_interactive(
    translate_lang: str, selected_files: list[list[str]]
) -> tuple[str, str]:
    """Interactive translation function that processes files one by one.

    Args:
        translate_lang: Target language to translate
        selected_files: List of file paths to translate
    """
    # Extract file paths from the dataframe format
    file_paths = [row[0] for row in selected_files if row and len(row) > 0]
    if not file_paths:
        # Callers unpack (status, translated_content), so return a matching pair.
        return "No files selected for translation.", ""

    # Start with the first file
    current_file = file_paths[0]

    callback_result, translated_content = translate_docs(translate_lang, current_file)
    status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
    status += f"💰 Used token and cost: \n```\n{callback_result}\n```"

    if len(file_paths) > 1:
        status += f"\n### 📝 Note: Currently, only the first file has been translated.\n> The remaining {len(file_paths) - 1} files have not been processed yet, as the system is in its beta version."

    return status, translated_content


def generate_github_pr(
    target_language: str,
    filepath: str,
    translated_content: str = None,
    github_config: dict = None,
) -> str:
    """Generate a GitHub PR for translated documentation.

    Args:
        target_language: Target language for translation (e.g., "ko")
        filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
        translated_content: Translated content (if None, read from file)
        github_config: GitHub configuration dictionary

    Returns:
        PR creation result message
    """
    if not GITHUB_PR_AVAILABLE:
        return "❌ GitHub PR Agent is not available. Please install required libraries."

    if not github_config:
        return "❌ GitHub configuration not provided."

    # Validate required configuration
    required_fields = ["token", "owner", "repo_name", "reference_pr_url"]
    missing_fields = [
        field for field in required_fields if not github_config.get(field)
    ]

    if missing_fields:
        return f"❌ Missing required configuration: {', '.join(missing_fields)}. Please provide these values."

    # Set token in environment for the agent.
    os.environ["GITHUB_TOKEN"] = github_config["token"]

    try:
        # Read translated content from file if not provided
        if translated_content is None:
            translation_file_path = (
                Path(__file__).resolve().parent.parent
                / f"translation_result/{filepath}"
            )
            if not translation_file_path.exists():
                return f"❌ Translation file not found: {translation_file_path}"

            with open(translation_file_path, "r", encoding="utf-8") as f:
                translated_content = f.read()

        if not translated_content or not translated_content.strip():
            return "❌ Translated content is empty."

        # Execute GitHub PR Agent
        print("🚀 Starting GitHub PR creation...")
        print(f"   📁 File: {filepath}")
        print(f"   🌍 Language: {target_language}")
        print(f"   📊 Reference PR: {github_config['reference_pr_url']}")
        print(
            f"   🏠 Repository: {github_config['owner']}/{github_config['repo_name']}"
        )

        agent = GitHubPRAgent()
        result = agent.run_translation_pr_workflow(
            reference_pr_url=github_config["reference_pr_url"],
            target_language=target_language,
            filepath=filepath,
            translated_doc=translated_content,
            owner=github_config["owner"],
            repo_name=github_config["repo_name"],
            base_branch=github_config.get("base_branch", "main"),
        )

        # Process result
        if result["status"] == "success":
            return f"""✅ **GitHub PR Creation Successful!**

🔗 **PR URL:** {result["pr_url"]}
🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}

{result["message"]}"""

        elif result["status"] == "partial_success":
            return f"""⚠️ **Partial Success**

🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}

{result["message"]}

**Error Details:**
{result.get("error_details", "Unknown error")}"""

        else:
            return f"""❌ **GitHub PR Creation Failed**

**Error Message:**
{result["message"]}"""

    except Exception as e:
        error_msg = f"❌ Unexpected error occurred during PR creation: {str(e)}"
        print(error_msg)
        return error_msg


# Backward compatibility function (replaces the old mock function)
def mock_generate_PR():
    """Backward compatibility stub - returns a warning message only."""
    return (
        "⚠️ mock_generate_PR() is deprecated. Please use generate_github_pr() instead."
    )
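A minimal sketch of calling `generate_github_pr` directly; the token, owner, and repository values are placeholders, and the file path matches the sample result checked into `translation_result/`:

from agent.workflow import generate_github_pr

github_config = {
    "token": "<github token>",              # placeholder
    "owner": "<github username>",           # placeholder
    "repo_name": "<fork of transformers>",  # placeholder
    "reference_pr_url": "https://github.com/huggingface/transformers/pull/24968",
}

# With translated_content omitted, the function reads
# translation_result/docs/source/en/accelerator_selection.md from disk.
message = generate_github_pr(
    target_language="ko",
    filepath="docs/source/en/accelerator_selection.md",
    github_config=github_config,
)
print(message)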
app.py
ADDED
@@ -0,0 +1,251 @@
"""Module for gradio chat-based translation agent interface."""

import base64
import os

import gradio as gr
from dotenv import load_dotenv

from agent.handler import (
    approve_handler,
    get_welcome_message,
    process_file_search_handler,
    restart_handler,
    send_message,
    start_translate_handler,
    sync_language_displays,
    update_status,
    update_github_config,
)
from translator.model import Languages

load_dotenv()


css = """
.gradio-container {
    background: linear-gradient(135deg, #ffeda7 0%, #ffbebf 100%);
}
.chat-container {
    background: rgba(255, 255, 180, 0.25);
    border-radius: 18px;
    box-shadow: 0 4px 24px rgba(0,0,0,0.08);
    padding: 1.5em;
    backdrop-filter: blur(8px);
    border: 1px solid rgba(255,255,180,0.25);
    width: 100%;
    height: 100%;
}
.control-panel {
    background: rgba(255, 255, 180, 0.25);
    border-radius: 18px;
    box-shadow: 0 4px 24px rgba(0,0,0,0.08);
    padding: 1.5em;
    backdrop-filter: blur(8px);
    border: 1px solid rgba(255,255,180,0.25);
    width: 100%;
}
.status-card {
    width: 100%
}
.action-button {
    background: linear-gradient(135deg, #ff8c8c 0%, #f9a889 100%) !important;
    color: white !important;
    border: none !important;
    font-weight: 600 !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
    transition: all 0.3s ease-in-out !important;
}
.action-button:hover {
    background: linear-gradient(135deg, #f9a889 0%, #ff8c8c 100%) !important;
    box-shadow: 0 6px 16px rgba(0,0,0,0.2) !important;
    transform: translateY(-2px) !important;
}

.simple-tabs .tab-nav button {
    background: transparent !important;
    color: #4A5568 !important;
    box-shadow: none !important;
    transform: none !important;
    border: none !important;
    border-bottom: 2px solid #E2E8F0 !important;
    border-radius: 0 !important;
    font-weight: 600 !important;
}

.simple-tabs .tab-nav button.selected {
    color: #f97316 !important;
    border-bottom: 2px solid #f97316 !important;
}

.simple-tabs .tab-nav button:hover {
    background: #f3f4f6 !important;
    color: #f97316 !important;
    box-shadow: none !important;
    transform: none !important;
}
"""


# Create the main interface
with gr.Blocks(
    css=css, title="🌐 Hugging Face Transformers Docs i18n made easy"
) as demo:

    # Title
    with open("images/hfkr_logo.png", "rb") as img_file:
        base64_img = base64.b64encode(img_file.read()).decode()
    gr.Markdown(
        f'<img src="data:image/png;base64,{base64_img}" style="display: block; margin-left: auto; margin-right: auto; height: 15em;"/>'
    )
    gr.Markdown(
        '<h1 style="text-align: center;">🌐 Hugging Face Transformers Docs i18n made easy</h1>'
    )

    # Content
    with gr.Row():
        # Chat interface
        with gr.Column(scale=4, elem_classes=["chat-container"]):
            gr.Markdown("### 🌐 Hugging Face i18n Agent")

            chatbot = gr.Chatbot(
                value=[[None, get_welcome_message()]], scale=1, height=585
            )

        # Controller interface
        with gr.Column(scale=2):
            # Quick Controller
            with gr.Column(elem_classes=["control-panel"]):
                gr.Markdown("### 🛠️ Quick Controls")
                status_display = gr.HTML(update_status())

                with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
                    with gr.TabItem("1. Find Files", id=0):
                        with gr.Group():
                            lang_dropdown = gr.Dropdown(
                                choices=[language.value for language in Languages],
                                label="🌍 Translate To",
                                value="ko",
                            )
                            k_input = gr.Number(
                                label="📊 First k missing translated docs",
                                value=1,
                                minimum=1,
                                maximum=100,
                            )
                            find_btn = gr.Button(
                                "🔍 Find Files to Translate",
                                elem_classes="action-button",
                            )

                    with gr.TabItem("2. Translate", id=1):
                        with gr.Group():
                            translate_lang_display = gr.Dropdown(
                                choices=[language.value for language in Languages],
                                label="🌍 Translation Language",
                                value="ko",
                                interactive=False,
                            )
                            anthropic_key = gr.Textbox(
                                label="🔑 Anthropic API key for translation generation",
                                type="password",
                            )
                            start_translate_btn = gr.Button(
                                "🚀 Start Translation", elem_classes="action-button"
                            )

                    with gr.TabItem("3. Upload PR", id=2):
                        with gr.Group():
                            github_token = gr.Textbox(
                                label="🔑 GitHub Token",
                                type="password",
                                placeholder="ghp_xxxxxxxxxxxxxxxxxxxx",
                            )
                            github_owner = gr.Textbox(
                                label="👤 GitHub Owner/Username",
                                placeholder="your-username",
                            )
                            github_repo = gr.Textbox(
                                label="📁 Repository Name",
                                placeholder="your-repository",
                            )
                            reference_pr_url = gr.Textbox(
                                label="🔗 Reference PR URL (Optional - Agent will find one if not provided)",
                                placeholder="reference PR URL",
                            )

                            save_config_btn = gr.Button(
                                "💾 Save GitHub Config", elem_classes="action-button"
                            )
                            approve_btn = gr.Button(
                                "✅ Generate GitHub PR", elem_classes="action-button"
                            )
                            restart_btn = gr.Button(
                                "🔄 Restart Translation", elem_classes="action-button"
                            )

            # Chat Controller
            with gr.Column(elem_classes=["control-panel"]):
                gr.Markdown("### 💬 Chat with agent")
                msg_input = gr.Textbox(
                    placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
                    container=False,
                    scale=4,
                )
                send_btn = gr.Button("Send", scale=1, elem_classes="action-button")

    # Event Handlers
    find_btn.click(
        fn=process_file_search_handler,
        inputs=[lang_dropdown, k_input, chatbot],
        outputs=[chatbot, msg_input, status_display, control_tabs],
    )

    # Sync language across tabs
    lang_dropdown.change(
        fn=sync_language_displays,
        inputs=[lang_dropdown],
        outputs=[translate_lang_display],
    )

    # Button event handlers
    start_translate_btn.click(
        fn=start_translate_handler,
        inputs=[chatbot, anthropic_key],
        outputs=[chatbot, msg_input, status_display, control_tabs],
    )

    # GitHub Config Save
    save_config_btn.click(
        fn=update_github_config,
        inputs=[github_token, github_owner, github_repo, reference_pr_url],
        outputs=[msg_input],
    )

    approve_btn.click(
        fn=approve_handler,
        inputs=[chatbot, github_owner, github_repo, reference_pr_url],
        outputs=[chatbot, msg_input, status_display],
    )

    restart_btn.click(
        fn=restart_handler,
        inputs=[chatbot],
        outputs=[chatbot, msg_input, status_display, control_tabs],
    )

    send_btn.click(
        fn=send_message,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input, status_display],
    )

    msg_input.submit(
        fn=send_message,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input, status_display],
    )

root_path = os.environ.get("GRADIO_ROOT_PATH")
demo.launch(root_path=root_path)
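For readers unfamiliar with the wiring above: a Gradio click handler must return one value per component listed in `outputs`. A minimal, self-contained sketch of the same pattern (component names here are illustrative):

import gradio as gr

def echo(message, history):
    # One return value per output component: updated chat history, cleared textbox.
    return history + [[message, f"You said: {message}"]], ""

with gr.Blocks() as sketch:
    chat = gr.Chatbot()
    box = gr.Textbox()
    gr.Button("Send").click(fn=echo, inputs=[box, chat], outputs=[chat, box])

# sketch.launch()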
example.env
ADDED
@@ -0,0 +1,7 @@
ANTHROPIC_API_KEY=<your api key>

# GitHub PR Agent Configuration
GITHUB_TOKEN=<your github token>
GITHUB_OWNER=<your github username>
GITHUB_REPO=<your repository name>
REFERENCE_PR_URL=<reference pr url for style analysis>
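Both app.py and pr_generator/agent.py pick these values up via python-dotenv; a minimal sketch of that pattern:

import os
from dotenv import load_dotenv

load_dotenv()  # reads a .env file (copied from example.env) in the working directory
token = os.environ.get("GITHUB_TOKEN")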
images/hfkr_logo.png
ADDED
index.html
ADDED
@@ -0,0 +1,19 @@
<!doctype html>
<html>
<head>
	<meta charset="utf-8" />
	<meta name="viewport" content="width=device-width" />
	<title>My static Space</title>
	<link rel="stylesheet" href="style.css" />
</head>
<body>
	<div class="card">
		<h1>Welcome to your static Space!</h1>
		<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
		<p>
			Also don't forget to check the
			<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
		</p>
	</div>
</body>
</html>
pr_generator/agent.py
ADDED
@@ -0,0 +1,593 @@
"""
GitHub PR creation agent using Langchain.
This code integrates with the actual GitHub API using the PyGithub library.
Please set the GITHUB_TOKEN environment variable and install required libraries before running.
"""

import os
import re
import json
from typing import Optional, Dict, List, Tuple, Any

# Load environment variables from .env file
from dotenv import load_dotenv

load_dotenv()

# Constants definition
ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
DEFAULT_TEMPERATURE = 0.0

# Library imports and error handling
try:
    from github import Github, GithubException
    from github.GitRef import GitRef
    from langchain_anthropic import ChatAnthropic

    REQUIRED_LIBS_AVAILABLE = True
except ImportError as e:
    print(f"Required libraries are not installed: {e}")
    print("Please run: pip install PyGithub boto3 langchain-anthropic")
    REQUIRED_LIBS_AVAILABLE = False


class GitHubPRAgent:
    """Agent class for GitHub PR creation."""

    def __init__(self):
        self._github_client = None
        self._llm = None

    @property
    def github_client(self) -> Optional[Github]:
        """Return GitHub API client with lazy initialization."""
        if not REQUIRED_LIBS_AVAILABLE:
            raise ImportError("Required libraries not found.")

        if self._github_client is None:
            token = os.environ.get("GITHUB_TOKEN")
            if not token:
                print("Warning: GITHUB_TOKEN environment variable not set.")
                return Github()  # Limited, unauthenticated access
            self._github_client = Github(token)

        return self._github_client

    @property
    def llm(self):
        """Return LLM client with lazy initialization."""
        if not REQUIRED_LIBS_AVAILABLE:
            raise ImportError("Required libraries not found.")

        if self._llm is None:
            self._llm = ChatAnthropic(
                model=ANTHROPIC_MODEL_ID,
                temperature=DEFAULT_TEMPERATURE,
            )
        return self._llm

    def _handle_github_error(self, e: Exception, operation: str) -> str:
        """Handle GitHub API errors consistently."""
        if isinstance(e, GithubException):
            return f"{operation} failed: {e.status} {e.data.get('message', e.data)}"
        return f"Unexpected error during {operation}: {str(e)}"

    def create_pull_request(
        self,
        owner: str,
        repo_name: str,
        title: str,
        head: str,
        base: str,
        body: str = "",
        draft: bool = False,
        maintainer_can_modify: bool = True,
    ) -> str:
        """Create a new Pull Request."""
        try:
            # 1. Check if head and base are the same
            if head == base:
                return f"ERROR: head branch ({head}) and base branch ({base}) are identical."

            # 2. Check for existing PR
            existing_pr = self.check_existing_pr(owner, repo_name, head, base)
            if existing_pr:
                return f"ERROR: {existing_pr}"

            # 3. Verify head branch exists
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            try:
                head_branch = repo.get_branch(head)
                base_branch = repo.get_branch(base)

                # 4. Check if head and base branches point to the same commit
                if head_branch.commit.sha == base_branch.commit.sha:
                    return f"ERROR: head branch ({head}) and base branch ({base}) point to the same commit. No changes to merge."

            except GithubException as e:
                if e.status == 404:
                    return f"ERROR: Branch not found. head: {head}, base: {base}"

            # 5. Create PR
            pr = repo.create_pull(
                title=title,
                body=body,
                head=head,
                base=base,
                draft=draft,
                maintainer_can_modify=maintainer_can_modify,
            )
            return f"PR creation successful: {pr.html_url}"
        except GithubException as e:
            if e.status == 422:
                error_msg = e.data.get("message", "Unknown error")
                errors = e.data.get("errors", [])

                error_details = []
                for error in errors:
                    if "message" in error:
                        error_details.append(error["message"])

                detail_msg = " | ".join(error_details) if error_details else ""
                return f"ERROR: PR creation failed (422): {error_msg}. {detail_msg}"
            return self._handle_github_error(e, "PR creation")
        except Exception as e:
            return self._handle_github_error(e, "PR creation")

    def create_branch(
        self, owner: str, repo_name: str, branch_name: str, source_sha: str
    ) -> str:
        """Create a new branch."""
        try:
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            ref_name = f"refs/heads/{branch_name}"
            new_ref = repo.create_git_ref(ref=ref_name, sha=source_sha)

            if isinstance(new_ref, GitRef):
                return f"SUCCESS: Branch '{branch_name}' created successfully (ref: {new_ref.ref})"
            return f"ERROR: Branch '{branch_name}' creation failed. Please check API response."
        except GithubException as e:
            if e.status == 422 and "Reference already exists" in str(e.data):
                return f"WARNING: Branch '{branch_name}' already exists."
            return self._handle_github_error(e, "branch creation")
        except Exception as e:
            return self._handle_github_error(e, "branch creation")

    def check_existing_pr(
        self, owner: str, repo_name: str, head: str, base: str
    ) -> Optional[str]:
        """Check if there's an existing PR with the same head and base."""
        try:
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            pulls = repo.get_pulls(state="open", head=f"{owner}:{head}", base=base)
            for pr in pulls:
                return f"Existing PR found: {pr.html_url}"
            return None
        except Exception as e:
            print(f"⚠️ Error checking existing PR: {str(e)}")
            return None

    def create_or_update_file(
        self,
        owner: str,
        repo_name: str,
        path: str,
        message: str,
        content: str,
        branch_name: Optional[str] = None,
        sha_blob: Optional[str] = None,
    ) -> str:
        """Create or update a single file."""
        try:
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")

            args = {
                "path": path,
                "message": message,
                "content": content,
            }
            if branch_name:
                args["branch"] = branch_name

            # Update the file when a blob SHA is already known
            if sha_blob:
                args["sha"] = sha_blob
                repo.update_file(**args)
                return f"SUCCESS: File updated - {path}"

            # Otherwise try to create the file
            repo.create_file(**args)
            return f"SUCCESS: File created - {path}"

        except GithubException as e:
            # A 422 usually means the file already exists; fetch its SHA and update instead
            if e.status == 422:
                try:
                    existing_file = repo.get_contents(
                        path, ref=branch_name or repo.default_branch
                    )
                    args["sha"] = existing_file.sha
                    repo.update_file(**args)
                    return f"SUCCESS: File updated - {path}"
                except Exception:
                    pass
            return f"ERROR: File processing failed - {path}"
        except Exception:
            return f"ERROR: File processing failed - {path}"

    def analyze_reference_pr(self, pr_url: str) -> Dict[str, Any]:
        """Analyze reference PR to extract style information."""
        try:
            # Parse PR URL
            match = re.match(r"https://github\.com/([^/]+)/([^/]+)/pull/(\d+)", pr_url)
            if not match:
                return {"error": f"Invalid PR URL format: {pr_url}"}

            owner, repo_name, pr_number = match.groups()
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            pr = repo.get_pull(int(pr_number))

            return {
                "title": pr.title,
                "body": pr.body,
                "head_branch": pr.head.ref,
                "base_branch": pr.base.ref,
                "files_changed": [f.filename for f in pr.get_files()],
                "commits": [
                    {"message": c.commit.message, "sha": c.sha}
                    for c in pr.get_commits()
                ],
            }
        except Exception as e:
            return {"error": f"Error occurred during PR analysis: {str(e)}"}

    def _generate_with_llm(
        self, prompt: str, fallback_value: str, operation: str
    ) -> str:
        """Generate text using the LLM, falling back to a default on failure."""
        try:
            response = self.llm.invoke(prompt)
            generated = response.content.strip()
            print(f"LLM generated {operation}: {generated}")
            return generated
        except Exception as e:
            print(f"❌ Error generating {operation} with LLM: {e}")
            print(f"Using fallback value: {fallback_value}")
            return fallback_value

    def generate_branch_name_from_reference(
        self, reference_branch_name: str, target_language: str, file_name: str
    ) -> str:
        """Use LLM to analyze reference PR information and generate an appropriate branch name."""
        prompt = f"""Here is the reference PR information:

Reference PR branch name: {reference_branch_name}

Now I need to generate a branch name for a new translation task:
- Target language: {target_language}
- File to translate: {file_name}

Please analyze the pattern and style of the reference PR branch name to generate a consistent new branch name.

Requirements:
1. Follow the naming conventions and patterns of the reference PR
2. Appropriately reflect the target language ({target_language}) and file name ({file_name}) if applicable

Please return only the branch name. No other explanation is needed."""

        fallback = f"translate-{target_language}-{file_name.replace('_', '-')}"
        return self._generate_with_llm(prompt, fallback, "branch name")

    def generate_pr_content_from_reference(
        self,
        reference_title: str,
        reference_body: str,
        target_language: str,
        filepath: str,
        target_filepath: str,
        file_name: str,
    ) -> Tuple[str, str]:
        """Use LLM to analyze the reference PR title and body and generate appropriate PR content."""
        prompt = f"""Here is the reference PR information:

Reference PR title: {reference_title}

Reference PR body:
{reference_body}

Now I need to generate a PR title and body for a new translation task:
- Target language: {target_language}
- Original file: {filepath}
- Translation file: {target_filepath}
- File name: {file_name}

Please analyze the style and format of the reference PR to generate a consistent new PR title and body.

Requirements:
1. Follow the title format and pattern of the reference PR
2. Maintain the body style, markdown format, indentation, and line breaks of the reference PR
3. Appropriately reflect the target language ({target_language}) and file paths
4. If there are user mentions (@username), change them to general text instead of actual mentions
5. Adjust the content to fit the translation task

Response format:
Title: [PR title here]
Body: [PR body here, maintaining the exact markdown format and structure of the original]"""

        try:
            response = self.llm.invoke(prompt)
            generated_content = response.content.strip()

            # Separate title and body from the response
            lines = generated_content.split("\n")
            title_line = ""
            body_lines = []
            parsing_body = False

            for line in lines:
                if line.startswith("Title:"):
                    title_line = line.replace("Title:", "").strip()
                elif line.startswith("Body:"):
                    parsing_body = True
                    body_content = line.replace("Body:", "").strip()
                    if body_content:
                        body_lines.append(body_content)
                elif parsing_body:
                    body_lines.append(line)

            generated_title = title_line if title_line else reference_title
            generated_body = (
                "\n".join(body_lines)
                if body_lines
                else f"Add {target_language} translation for `{filepath}`."
            )

            print(f"LLM generated PR title: {generated_title}")
            print(f"LLM generated PR body (first 100 chars): {generated_body[:100]}...")

            return generated_title, generated_body

        except Exception as e:
            print(f"❌ Error generating PR content with LLM: {e}")
            return self._generate_default_pr_content(
                target_language, filepath, target_filepath, file_name
            )

    def _generate_default_pr_content(
        self, target_language: str, filepath: str, target_filepath: str, file_name: str
    ) -> Tuple[str, str]:
        """Generate default PR content."""
        title = f"[i18n-{target_language}] Add {target_language} translation for {file_name}"
        body = f"""## Summary
Add {target_language} translation for `{filepath}`.

## Changes
- Add {target_language} translation: `{target_filepath}`
- Original file: `{filepath}`
"""
        return title, body

    def generate_commit_message_from_reference(
        self, commit_messages: List[str], target_language: str, file_name: str
    ) -> str:
        """Use LLM to analyze reference PR commit messages and generate an appropriate commit message."""
        commits_text = (
            "\n".join([f"- {msg}" for msg in commit_messages])
            if commit_messages
            else "None"
        )

        prompt = f"""Here are the commit messages from the reference PR:

{commits_text}

Now I need to generate a commit message for a new translation task:
- Target language: {target_language}
- File to translate: {file_name}

Please analyze the commit message patterns and style of the reference PR to generate a consistent new commit message.

Requirements:
1. Follow the commit message style and format of the reference PR
2. Appropriately reflect the target language ({target_language}) and file name ({file_name})
3. Follow general Git commit message conventions
4. Be concise and clear
5. If you detect typos in the given commit messages, use corrected versions (e.g., dos -> docs)

Please return only the commit message. No other explanation is needed."""

        fallback = f"docs: add {target_language} translation for {file_name}"
        return self._generate_with_llm(prompt, fallback, "commit message")

    def get_branch_info(self, owner: str, repo_name: str, branch_name: str) -> str:
        """Get information about an existing branch."""
        try:
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            branch = repo.get_branch(branch_name)
            commit = branch.commit
            commit_info = commit.commit

            return f"""
📋 Existing branch information:
- Branch name: {branch_name}
- Latest commit: {commit.sha[:8]}
- Commit message: {commit_info.message.split(chr(10))[0][:80]}...
- Author: {commit_info.author.name}
- Date: {commit_info.author.date.strftime('%Y-%m-%d %H:%M:%S')}
"""
        except Exception as e:
            return f"Failed to retrieve branch information: {str(e)}"

    def run_translation_pr_workflow(
        self,
        reference_pr_url: str,
        target_language: str,
        filepath: str,
        translated_doc: str,
        owner: str,
        repo_name: str,
        base_branch: str = "main",
    ) -> Dict[str, Any]:
        """Execute translation document PR creation workflow."""
        try:
            # 1. Analyze reference PR
            print(f"🔍 Analyzing reference PR: {reference_pr_url}")
            pr_analysis = self.analyze_reference_pr(reference_pr_url)

            if "error" in pr_analysis:
                return {"status": "error", "message": pr_analysis["error"]}

            print("Reference PR analysis completed")

            # 2. Generate translation file path and branch name
            target_filepath = filepath.replace("/en/", f"/{target_language}/")
            file_name = filepath.split("/")[-1].replace(".md", "")

            print("🌿 Generating branch name...")
            branch_name = self.generate_branch_name_from_reference(
                pr_analysis["head_branch"], target_language, file_name
            )

            # 3. Get main branch SHA and create branch
            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
            main_branch = repo.get_branch(base_branch)
            main_sha = main_branch.commit.sha

            print(f"🌿 Creating branch: {branch_name}")
            branch_result = self.create_branch(owner, repo_name, branch_name, main_sha)

            # Check branch creation result
            if branch_result.startswith("ERROR"):
                return {
                    "status": "error",
                    "message": f"Branch creation failed: {branch_result}",
                    "branch": branch_name,
                }
            elif branch_result.startswith("WARNING"):
                print(f"⚠️ {branch_result}")
                # Continue if branch already exists
            else:
                print(f"{branch_result}")

            # 4. Generate commit message and save file
            commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
            commit_message = self.generate_commit_message_from_reference(
                commit_messages, target_language, file_name
            )

            print(f"📄 Saving file: {target_filepath}")
            file_result = self.create_or_update_file(
                owner,
                repo_name,
                target_filepath,
                commit_message,
                translated_doc,
                branch_name,
            )

            if not file_result.startswith("SUCCESS"):
                return {
                    "status": "error",
                    "message": "An issue occurred while saving the file.",
                    "branch": branch_name,
                    "file_path": target_filepath,
                }

            print(f"{file_result}")

            # 5. Create PR
            pr_title, pr_body = self.generate_pr_content_from_reference(
                pr_analysis["title"],
                pr_analysis["body"],
                target_language,
                filepath,
                target_filepath,
                file_name,
            )

            print(f"🔄 Creating PR: {pr_title}")
            print(f"   Head: {branch_name} → Base: {base_branch}")

            pr_result = self.create_pull_request(
                owner, repo_name, pr_title, branch_name, base_branch, pr_body
            )

            if pr_result.startswith("ERROR"):
                print(f"❌ {pr_result}")
                return {
                    "status": "partial_success",
                    "branch": branch_name,
                    "file_path": target_filepath,
                    "message": f"File was saved but PR creation failed: {pr_result}",
                    "error_details": pr_result,
                }
            elif "successful" in pr_result and "http" in pr_result:
                print(f"{pr_result}")
                return {
                    "status": "success",
                    "branch": branch_name,
                    "file_path": target_filepath,
                    "pr_url": pr_result.split(": ")[-1],
                    "message": "Translation document PR created successfully!",
                }
            else:
                return {
                    "status": "partial_success",
|
536 |
+
"branch": branch_name,
|
537 |
+
"file_path": target_filepath,
|
538 |
+
"message": "File was saved but PR creation failed.",
|
539 |
+
}
|
540 |
+
|
541 |
+
except Exception as e:
|
542 |
+
return {
|
543 |
+
"status": "error",
|
544 |
+
"message": f"Error occurred during workflow execution: {str(e)}",
|
545 |
+
}
|
546 |
+
|
547 |
+
|
548 |
+
# Backward compatibility functions (maintain compatibility with existing code)
|
549 |
+
_agent = GitHubPRAgent()
|
550 |
+
|
551 |
+
|
552 |
+
def get_github_client():
|
553 |
+
return _agent.github_client
|
554 |
+
|
555 |
+
|
556 |
+
def create_pull_request_func(*args, **kwargs):
|
557 |
+
return _agent.create_pull_request(*args, **kwargs)
|
558 |
+
|
559 |
+
|
560 |
+
def create_branch_func(*args, **kwargs):
|
561 |
+
return _agent.create_branch(*args, **kwargs)
|
562 |
+
|
563 |
+
|
564 |
+
def create_or_update_file_func(*args, **kwargs):
|
565 |
+
return _agent.create_or_update_file(*args, **kwargs)
|
566 |
+
|
567 |
+
|
568 |
+
def analyze_reference_pr_func(*args, **kwargs):
|
569 |
+
return _agent.analyze_reference_pr(*args, **kwargs)
|
570 |
+
|
571 |
+
|
572 |
+
def generate_branch_name_from_reference(*args, **kwargs):
|
573 |
+
return _agent.generate_branch_name_from_reference(*args, **kwargs)
|
574 |
+
|
575 |
+
|
576 |
+
def generate_pr_content_from_reference(*args, **kwargs):
|
577 |
+
return _agent.generate_pr_content_from_reference(*args, **kwargs)
|
578 |
+
|
579 |
+
|
580 |
+
def generate_default_pr_content(*args, **kwargs):
|
581 |
+
return _agent._generate_default_pr_content(*args, **kwargs)
|
582 |
+
|
583 |
+
|
584 |
+
def generate_commit_message_from_reference(*args, **kwargs):
|
585 |
+
return _agent.generate_commit_message_from_reference(*args, **kwargs)
|
586 |
+
|
587 |
+
|
588 |
+
def get_branch_info(*args, **kwargs):
|
589 |
+
return _agent.get_branch_info(*args, **kwargs)
|
590 |
+
|
591 |
+
|
592 |
+
def run_translation_pr_agent_simple(*args, **kwargs):
|
593 |
+
return _agent.run_translation_pr_workflow(*args, **kwargs)
|
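For orientation, here is a minimal sketch of how the backward-compatible wrapper could be driven end to end. The reference PR URL, fork owner, and file path are placeholder values for illustration, and the call assumes `GITHUB_TOKEN` and `ANTHROPIC_API_KEY` are configured as in `example.env`.

```python
# Hypothetical usage sketch; owner/filepath/translated_doc are illustrative only.
from pr_generator.agent import run_translation_pr_agent_simple

result = run_translation_pr_agent_simple(
    reference_pr_url="https://github.com/huggingface/transformers/pull/24968",
    target_language="ko",
    filepath="docs/source/en/accelerator_selection.md",
    translated_doc="# 가속기 선택 ...",  # translated markdown produced earlier
    owner="my-fork-owner",  # hypothetical fork owner
    repo_name="transformers",
)
print(result["status"], result.get("pr_url", result["message"]))
```

The returned dict always carries `status` and `message`, plus `branch`/`file_path`/`pr_url` where available, so a caller can distinguish full success from the `partial_success` case where the file landed but the PR did not.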
pr_generator/searcher.py
ADDED
@@ -0,0 +1,238 @@
"""
GitHub PR Search Agent
An agent that finds a suitable reference PR when a reference PR URL is not provided.
"""

import os
import re
import logging
from typing import List, Dict, Any, Optional

# Load environment variables
from dotenv import load_dotenv

load_dotenv()

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# LangChain imports
try:
    from langchain_anthropic import ChatAnthropic
    from langchain.tools import StructuredTool
    from langchain.agents import AgentExecutor, create_tool_calling_agent
    from langchain_core.prompts import ChatPromptTemplate
    from github import Github

    REQUIRED_LIBS_AVAILABLE = True
except ImportError as e:
    print(f"Required libraries are not installed: {e}")
    REQUIRED_LIBS_AVAILABLE = False

# Constants
ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
DEFAULT_TEMPERATURE = 0.0
# Fallback PR URL to ensure a PR is always returned
DEFAULT_FALLBACK_PR_URL = "https://github.com/huggingface/transformers/pull/24968"


class GitHubPRSearcher:
    """GitHub PR searcher, implemented with a LangChain agent."""

    def _search_github_prs(self, query: str) -> List[Dict[str, Any]]:
        """
        Search GitHub for pull requests matching the query and return the top 5 results.
        The query should be a valid GitHub search query.
        """
        logger.info(f"Executing GitHub search with query: {query}")
        try:
            issues = self.github_client.search_issues(query=query)
            # Take the top 5 to keep the context small for the agent
            top_issues = issues.get_page(0)[:5]

            if not top_issues:
                return []

            return [
                {"title": issue.title, "url": issue.html_url, "number": issue.number}
                for issue in top_issues
            ]
        except Exception as e:
            logger.error(f"Error during GitHub search: {e}", exc_info=True)
            # Return an error message that the agent can understand
            return [{"error": f"An error occurred during search: {e}"}]

    def __init__(self):
        if not REQUIRED_LIBS_AVAILABLE:
            raise ImportError("Required libraries for the agent could not be found.")

        self._github_client = None
        self.llm = ChatAnthropic(
            model=ANTHROPIC_MODEL_ID,
            temperature=DEFAULT_TEMPERATURE,
        )

        search_tool = StructuredTool.from_function(
            func=self._search_github_prs,
            name="search_github_prs",
            description="Searches GitHub for pull requests matching the query and returns the top 5 results. The query should be a valid GitHub search query.",
        )
        tools = [search_tool]

        prompt_string = """You are a GitHub expert. Your mission is to find the best reference pull request (PR) for a given task.

You need to find a merged PR in the repository: {owner}/{repo_name}.
The PR should be for a documentation translation into **{target_language}**.
The context for the translation is: **{context}**.

Use the tools at your disposal to search for relevant PRs.
Analyze the search results and select the one that best matches the request. A good PR is usually one that has "translation", "docs", "i18n", and the target language in its title.

Here is an example of a good search query you could use:
`repo:{owner}/{repo_name} is:pr is:merged "{target_language}" "{context}" i18n translation docs`

After your analysis, you MUST output **only the final URL** of the best PR you have chosen. Do not include any other text in your final response."""

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", prompt_string),
                (
                    "human",
                    "Find the best reference PR for translating docs to {target_language} about {context} in the {owner}/{repo_name} repository.",
                ),
                ("placeholder", "{agent_scratchpad}"),
            ]
        )

        agent = create_tool_calling_agent(self.llm, tools, prompt)
        self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

    @property
    def github_client(self) -> Optional[Github]:
        """Lazy initialization of the GitHub API client."""
        if not REQUIRED_LIBS_AVAILABLE:
            raise ImportError("Required libraries could not be found.")

        if self._github_client is None:
            token = os.environ.get("GITHUB_TOKEN")
            if not token:
                print("Warning: GITHUB_TOKEN environment variable is not set.")
                self._github_client = Github()  # Limited access
            else:
                self._github_client = Github(token)
        return self._github_client

    def find_best_reference_pr(
        self, owner: str, repo_name: str, target_language: str, context: str
    ):
        """
        Finds the best reference PR using a LangChain agent.
        Yields progress messages and returns the final PR URL.
        """
        message = "🤖 Agent is searching for the best reference PR..."
        logger.info(message)
        yield message

        try:
            agent_input = {
                "owner": owner,
                "repo_name": repo_name,
                "target_language": target_language,
                "context": context,
            }

            agent_output = None
            for event in self.agent_executor.stream(agent_input):
                if "actions" in event and event["actions"]:
                    action = event["actions"][0]
                    tool_query = action.tool_input.get("query", str(action.tool_input))
                    message = f"🔍 Agent is using tool `{action.tool}` with query:\n`{tool_query}`"
                    logger.info(message)
                    yield message
                elif "steps" in event and event["steps"]:
                    message = "📊 Agent is analyzing the results from the tool..."
                    logger.info(message)
                    yield message
                elif "output" in event and event["output"]:
                    agent_output = event["output"]

            if not agent_output:
                message = "⚠️ Agent failed to find a suitable PR. Using default PR."
                logger.warning(message)
                yield message
                return DEFAULT_FALLBACK_PR_URL

            # The agent's final output can be a string, a list of tool results,
            # or a list of content blocks from the LLM. We'll find the URL
            # by searching for it in the string representation of the output.
            output_text = str(agent_output)
            urls = re.findall(r"https?://github.com/[^/]+/[^/]+/pull/\d+", output_text)

            final_url = ""
            if urls:
                final_url = urls[-1]  # Take the last URL found

            if not final_url:
                message = f"⚠️ Agent returned unparsable output: {agent_output}. Using default PR."
                logger.warning(message)
                yield message
                return DEFAULT_FALLBACK_PR_URL

            message = f"✅ Selected the best PR:\n`{final_url}`"
            logger.info(f"Selected the best PR: {final_url}")
            yield message
            return final_url

        except Exception as e:
            message = f"❌ Error during agent execution: {e}\nUsing default PR."
            logger.error(message, exc_info=True)
            yield message
            return DEFAULT_FALLBACK_PR_URL


def find_reference_pr_simple_stream(target_language: str = "", context: str = ""):
    """
    A simple function to find a reference PR, streaming progress.
    This function always searches in the 'huggingface/transformers' repository.
    """
    searcher = GitHubPRSearcher()
    stream_generator = searcher.find_best_reference_pr(
        "huggingface", "transformers", target_language, context
    )
    # The handler will receive the final URL from the generator's return statement
    final_url = yield from stream_generator

    # Format the final result as expected by the handler
    return {
        "status": "success",
        "result": f"Recommended PR URL: {final_url}",
        "repository": "huggingface/transformers",
        "target_language": target_language,
    }


# Example usage
if __name__ == "__main__":
    # Example execution for streaming.
    # In a real application, a generator consumer (like the one in handler.py)
    # would process the yielded values. This script simulates that.
    print("--- Running Streaming Search Simulation ---")

    def run_simulation():
        """Simulates the consumption of the streaming generator."""
        test_gen = find_reference_pr_simple_stream(
            target_language="korean", context="docs"
        )
        try:
            while True:
                # This will print progress messages
                print(next(test_gen))
        except StopIteration as e:
            # When the generator is exhausted, the final result is in e.value
            print("\n--- FINAL RESULT ---")
            print(e.value)

    run_simulation()
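Beyond the CLI simulation above, a consumer that feeds a chat UI needs the same `StopIteration.value` trick to recover the generator's return value. A hypothetical sketch follows; the `history` list of `(user, bot)` message tuples mimics a `gr.Chatbot` value and is an assumption, not code from `handler.py`.

```python
# Hypothetical chat-side consumer of the streaming searcher.
from pr_generator.searcher import find_reference_pr_simple_stream

def stream_reference_search(target_language: str, context: str, history: list):
    gen = find_reference_pr_simple_stream(target_language, context)
    while True:
        try:
            progress = next(gen)  # progress message for the chat window
            history.append((None, progress))
            yield history
        except StopIteration as stop:  # the final dict rides on StopIteration
            history.append((None, stop.value["result"]))
            yield history
            break
```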
requirements.txt
ADDED
@@ -0,0 +1,9 @@
gradio==5.33.0
requests
pydantic
langchain-anthropic
python-dotenv
langchain
PyGithub
langchain-core
langchain-community
style.css
ADDED
@@ -0,0 +1,28 @@
body {
    padding: 2rem;
    font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
}

h1 {
    font-size: 16px;
    margin-top: 0;
}

p {
    color: rgb(107, 114, 128);
    font-size: 15px;
    margin-bottom: 10px;
    margin-top: 5px;
}

.card {
    max-width: 620px;
    margin: 0 auto;
    padding: 16px;
    border: 1px solid lightgray;
    border-radius: 16px;
}

.card p:last-child {
    margin-bottom: 0;
}
test/__init__.py
ADDED
File without changes
test/test_final_translate.md
ADDED
@@ -0,0 +1,127 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# 가속기 선택 [[accelerator-selection]]

분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.

이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.

## 가속기 개수 [[number-of-accelerators]]

예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.

<hfoptions id="select-accelerator">
<hfoption id="torchrun">

사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.

```bash
torchrun --nproc_per_node=2 trainer-program.py ...
```

</hfoption>
<hfoption id="Accelerate">

사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.

```bash
accelerate launch --num_processes 2 trainer-program.py ...
```

</hfoption>
<hfoption id="🤗 DeepSpeed">

사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.

```bash
deepspeed --num_gpus 2 trainer-program.py ...
```

</hfoption>
</hfoptions>

## 가속기 순서 [[order-of-accelerators]]
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.

예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:

<hfoptions id="accelerator-type">
<hfoption id="CUDA">

```bash
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
```

GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):

```bash
CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
```

GPU 없이 실행하려면:

```bash
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
```

`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:

- PCIe 버스 ID 순서(`nvidia-smi`와 일치):

```bash
$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
```

- 컴퓨팅 성능 순서(가장 빠른 것부터):

```bash
export CUDA_DEVICE_ORDER=FASTEST_FIRST
```

</hfoption>
<hfoption id="Intel XPU">

```bash
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
```

XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):

```bash
ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
```

다음으로 Intel XPU의 순서를 제어할 수도 있습니다:

```bash
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
```

Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.

</hfoption>
</hfoptions>

> [!WARNING]
> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
test/test_prompt.py
ADDED
@@ -0,0 +1,71 @@
output = """
What do these sentences about Hugging Face Transformers (a machine learning library) mean in Korean? Please do not translate the word after a 🤗 emoji as it is a product name. Output only the translated result without any explanations or introductions.
```md
# Accelerator selection

During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).

This guide will show you how to select the number of accelerators to use and the order to use them in.

## Number of accelerators

For example, if there are 4 accelerators and you only want to use the first 2, run the command below.

<hfoptions id="select-accelerator">
<hfoption id="torchrun">

Use the `--nproc_per_node` to select how many accelerators to use.

</hfoption>
<hfoption id="Accelerate">

Use `--num_processes` to select how many accelerators to use.

</hfoption>
<hfoption id="DeepSpeed">

Use `--num_gpus` to select how many GPUs to use.

</hfoption>
</hfoptions>

## Order of accelerators
To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.

For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:

<hfoptions id="accelerator-type">
<hfoption id="CUDA">

Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):

To run without any GPUs:

You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:

- Order by PCIe bus ID (matches `nvidia-smi`):


- Order by compute capability (fastest first):


</hfoption>
<hfoption id="Intel XPU">

Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):

You can also control the order of Intel XPUs with:

For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.

</hfoption>
</hfoptions>

> [!WARNING]
> Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
```
"""
test/test_translate.py
ADDED
@@ -0,0 +1,68 @@
translated_content = """
# 가속기 선택

분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.

이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.

## 가속기 개수

예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.

<hfoptions id="select-accelerator">
<hfoption id="torchrun">

사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.

</hfoption>
<hfoption id="Accelerate">

사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.

</hfoption>
<hfoption id="🤗 DeepSpeed">

사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.

</hfoption>
</hfoptions>

## 가속기 순서
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.

예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:

<hfoptions id="accelerator-type">
<hfoption id="CUDA">

GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):

GPU 없이 실행하려면:

`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:

- PCIe 버스 ID 순서(`nvidia-smi`와 일치):


- 컴퓨팅 성능 순서(가장 빠른 것부터):


</hfoption>
<hfoption id="Intel XPU">

XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):

다음으로 Intel XPU의 순서를 제어할 수도 있습니다:

Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.

</hfoption>
</hfoptions>

> [!WARNING]
> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
"""
translation_result/docs/source/.DS_Store
ADDED
Binary file (6.15 kB)
translation_result/docs/source/en/accelerator_selection.md
ADDED
@@ -0,0 +1,127 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# 가속기 선택 [[accelerator-selection]]

분산 훈련 중에 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 연산 성능을 가진 가속기가 있고 더 빠른 가속기를 먼저 사용하고 싶을 때 유용할 수 있습니다. 또는 사용 가능한 가속기 중 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.

이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.

## 가속기 수 [[number-of-accelerators]]

예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령을 실행하세요.

<hfoptions id="select-accelerator">
<hfoption id="torchrun">

`--nproc_per_node`를 사용하여 사용할 가속기 수를 선택하세요.

```bash
torchrun --nproc_per_node=2 trainer-program.py ...
```

</hfoption>
<hfoption id="Accelerate">

`--num_processes`를 사용하여 사용할 가속기 수를 선택하세요.

```bash
accelerate launch --num_processes 2 trainer-program.py ...
```

</hfoption>
<hfoption id="DeepSpeed">

`--num_gpus`를 사용하여 사용할 GPU 수를 선택하세요.

```bash
deepspeed --num_gpus 2 trainer-program.py ...
```

</hfoption>
</hfoptions>

## 가속기 순서 [[order-of-accelerators]]
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 각 실행마다 명령줄에서 설정되는 경우가 많지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.

예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:

<hfoptions id="accelerator-type">
<hfoption id="CUDA">

```bash
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
```

GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):

```bash
CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
```

GPU 없이 실행하려면:

```bash
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
```

`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치 순서를 제어할 수도 있습니다:

- PCIe 버스 ID 순서로 정렬 (`nvidia-smi`와 일치):

```bash
$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
```

- 연산 성능 순서로 정렬 (가장 빠른 것부터):

```bash
export CUDA_DEVICE_ORDER=FASTEST_FIRST
```

</hfoption>
<hfoption id="Intel XPU">

```bash
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
```

XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):

```bash
ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
```

다음으로 Intel XPU 순서를 제어할 수도 있습니다:

```bash
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
```

Intel XPU의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.

</hfoption>
</hfoptions>

> [!WARNING]
> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 결국 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신, 동일한 명령줄에서 특정 훈련 실행에 대해 환경 변수를 설정하는 것이 일반적인 관례입니다.
```
translator/__init__.py
ADDED
File without changes
translator/content.py
ADDED
@@ -0,0 +1,105 @@
import re
import string

import requests
from langchain.callbacks import get_openai_callback
from langchain_anthropic import ChatAnthropic


def get_content(filepath: str) -> str:
    url = string.Template(
        "https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
    ).safe_substitute(filepath=filepath)
    response = requests.get(url)
    if response.status_code == 200:
        content = response.text
        return content
    else:
        raise ValueError("Failed to retrieve content from the URL.", url)


def preprocess_content(content: str) -> str:
    # Extract text to translate from the document

    ## ignore the top license comment
    to_translate = content[content.find("#") :]
    ## remove code blocks from the text
    to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
    ## remove markdown tables from the text
    to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
    ## remove empty lines from the text
    to_translate = re.sub(r"\n\n+", "\n\n", to_translate)

    return to_translate


def get_full_prompt(language: str, to_translate: str) -> str:
    prompt = string.Template(
        "What do these sentences about Hugging Face Transformers "
        "(a machine learning library) mean in $language? "
        "Please do not translate the word after a 🤗 emoji "
        "as it is a product name. Output only the translated markdown result "
        "without any explanations or introductions.\n\n```md"
    ).safe_substitute(language=language)
    return "\n".join([prompt, to_translate.strip(), "```"])


def split_markdown_sections(markdown: str) -> list:
    # Find all titles using regular expressions
    return re.split(r"^(#+\s+)(.*)$", markdown, flags=re.MULTILINE)[1:]
    # format is like [level, title, content, level, title, content, ...]


def get_anchors(divided: list) -> list:
    anchors = []
    # from https://github.com/huggingface/doc-builder/blob/01b262bae90d66e1150cdbf58c83c02733ed4366/src/doc_builder/build_doc.py#L300-L302
    for title in divided[1::3]:
        anchor = re.sub(r"[^a-z0-9\s]+", "", title.lower())
        anchor = re.sub(r"\s{2,}", " ", anchor.strip()).replace(" ", "-")
        anchors.append(f"[[{anchor}]]")
    return anchors


def make_scaffold(content: str, to_translate: str) -> string.Template:
    scaffold = content
    for i, text in enumerate(to_translate.split("\n\n")):
        scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
    return string.Template(scaffold)


def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
    scaffold = make_scaffold(content, to_translate)
    divided = split_markdown_sections(to_translate)
    anchors = get_anchors(divided)

    translated = split_markdown_sections(translated)

    translated[1::3] = [
        f"{korean_title} {anchors[i]}"
        for i, korean_title in enumerate(translated[1::3])
    ]
    translated = "".join(
        ["".join(translated[i * 3 : i * 3 + 3]) for i in range(len(translated) // 3)]
    ).split("\n\n")
    if newlines := scaffold.template.count("$hf_i18n_placeholder") - len(translated):
        return str(
            [
                f"Please {'recover' if newlines > 0 else 'remove'} "
                f"{abs(newlines)} incorrectly inserted double newlines."
            ]
        )

    translated_doc = scaffold.safe_substitute(
        {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated)}
    )
    return translated_doc


def llm_translate(to_translate: str) -> tuple[str, str]:
    with get_openai_callback() as cb:
        model = ChatAnthropic(
            model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
        )
        ai_message = model.invoke(to_translate)
    print("cb:", cb)
    return cb, ai_message.content
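Taken together, these helpers form the translation pipeline the app drives: fetch the English source, strip the untranslatable parts, prompt the model, and re-inflate the translation into the original scaffold. A minimal sketch of the composition, assuming `ANTHROPIC_API_KEY` is set and using an example docs path:

```python
# Minimal pipeline sketch; the filepath is illustrative.
from translator.content import (
    get_content, preprocess_content, get_full_prompt, llm_translate, fill_scaffold
)

filepath = "docs/source/en/accelerator_selection.md"
content = get_content(filepath)                       # raw English markdown
to_translate = preprocess_content(content)            # drop license, code, tables
prompt = get_full_prompt("Korean", to_translate)
callback, translated = llm_translate(prompt)          # (token usage, translation)
translated_doc = fill_scaffold(content, to_translate, translated)
print(translated_doc[:200])
```

Note that `fill_scaffold` returns an error string rather than the document when the translation's paragraph count no longer matches the scaffold's placeholders, so callers should check the result before committing it.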
translator/model.py
ADDED
@@ -0,0 +1,70 @@
from enum import Enum, unique

from pydantic import BaseModel, computed_field


@unique
class Languages(Enum):
    az = "az"
    bn = "bn"
    de = "de"
    em = "em"
    es = "es"
    fa = "fa"
    fr = "fr"
    he = "he"
    hu = "hu"
    id = "id"
    it = "it"
    ja = "ja"
    ko = "ko"
    pl = "pl"
    pt = "pt"
    ru = "ru"
    tr = "tr"
    uk = "uk"
    ur = "ur"
    vi = "vi"
    yo = "yo"
    zh = "zh"
    zh_hant = "zh-hant"


class TranslationDoc(BaseModel):
    official_lang: str = "en"
    translation_lang: str
    original_file: str
    translation_file: str | None = None
    translation_exists: bool


class Summary(BaseModel):
    lang: str
    files_analyzed: int = 0
    files_translated: int = 0
    files_outdated: int = 0
    files_missing_translation: int = 0
    files: list[TranslationDoc] = []

    @computed_field  # type: ignore
    @property
    def percentage_missing_translation(self) -> float:
        try:
            return (
                100 * float(self.files_missing_translation) / float(self.files_analyzed)
            )
        except Exception:
            return 0.0

    def append_file(self, doc: TranslationDoc) -> None:
        self.files.append(doc)
        self.files_analyzed += 1

        if doc.translation_exists:
            self.files_translated += 1

        if not doc.translation_exists:
            self.files_missing_translation += 1

    def first_missing_translation_files(self, length: int = 10) -> list[TranslationDoc]:
        return list(filter(lambda d: not d.translation_exists, self.files))[:length]
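A quick sketch of the data model in isolation; the file paths here are illustrative values, not part of the upload:

```python
# Exercise Summary/TranslationDoc without any network access.
from translator.model import Summary, TranslationDoc

summary = Summary(lang="ko")
summary.append_file(
    TranslationDoc(
        translation_lang="ko",
        original_file="docs/source/en/accelerator_selection.md",
        translation_file="docs/source/ko/accelerator_selection.md",
        translation_exists=False,
    )
)
print(summary.percentage_missing_translation)  # 100.0: 1 of 1 files untranslated
print(summary.first_missing_translation_files(1)[0].original_file)
```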
translator/retriever.py
ADDED
@@ -0,0 +1,79 @@
import os
from pathlib import Path

import requests

from .model import Languages, Summary, TranslationDoc

URL = "https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1"


def get_github_repo_files():
    """
    Get the list of files in the GitHub repo
    """
    response = requests.get(URL)

    data = response.json()
    all_items = data.get("tree", [])

    file_paths = [
        item["path"]
        for item in all_items
        if item["type"] == "blob" and (item["path"].startswith("docs"))
    ]
    return file_paths


def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """
    Retrieve missing docs
    """

    report = f"""
| Item | Count | Percentage |
|------|-------|------------|
| 📂 Hugging Face docs | {summary.files_analyzed} | - |
| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
"""
    print(report)
    first_missing_docs = list()
    for file in summary.first_missing_translation_files(table_size):
        first_missing_docs.append(file.original_file)

    print(first_missing_docs)
    return report, first_missing_docs


def report(target_lang: str, top_k: int = 1) -> tuple[str, list[str]]:
    """
    Generate a report for the translated docs
    """
    docs_file = get_github_repo_files()

    base_docs_path = Path("docs/source")
    en_docs_path = Path("docs/source/en")

    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    for file in docs_file:
        if file.endswith(".md"):
            try:
                file_relative_path = Path(file).relative_to(en_docs_path)
            except ValueError:
                continue

            translated_path = os.path.join(
                base_docs_path, lang.value, file_relative_path
            )
            translation_exists = translated_path in docs_file

            doc = TranslationDoc(
                translation_lang=lang.value,
                original_file=file,
                translation_file=translated_path,
                translation_exists=translation_exists,
            )
            summary.append_file(doc)
    return retrieve(summary, top_k)
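A short usage sketch of the report entry point; this makes one unauthenticated GitHub API call, so it is subject to rate limits:

```python
# List the first five English docs with no Korean translation yet.
from translator.retriever import report

markdown_table, missing = report("ko", top_k=5)
print(markdown_table)  # summary table of analyzed vs. missing docs
for path in missing:
    print(path)        # e.g. docs/source/en/<page>.md
```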