openai-agents-python/docs/scripts/translate_docs.py
Kazuhiro Sera 360f173b73
Evolve the doc translation workflow by using gpt-4.1 (#507)
This pull request enhances the document translation workflow by
switching to the new GPT-4.1 model. The generator script’s prompt now
includes a “workflow” section that guides the model to iterate
self-reviews on its outputs to autonomously achieve the highest quality.
This addition has noticeably improved the naturalness and consistency of
the wording in the translated outputs.
2025-04-14 22:04:07 -04:00

267 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ruff: noqa
import os
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
# import logging
# logging.basicConfig(level=logging.INFO)
# logging.getLogger("openai").setLevel(logging.DEBUG)
# Model used for every translation request; override it with the
# OPENAI_MODEL environment variable.
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4.1")

# Root directory of the English docs. Translated pages are written to
# <source_dir>/<language code>/...
source_dir = "docs"

# Target languages: language code -> language name used in the prompt.
languages = {
    "ja": "Japanese",
    # Add more languages here, e.g., "fr": "French"
}
# Shared OpenAI client; the API key is taken from the OPENAI_API_KEY
# environment variable.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Translation-control tables. Their contents are interpolated into the prompt.

# Terms that must be kept in English as-is; listed one per line in the prompt.
do_not_translate = [
    "OpenAI",
    "Agents SDK",
    "Hello World",
    "Model context protocol",
    "structured outputs",
    "Chain-of-Thought",
    "Chat Completions",
    "Computer-Using Agent",
    "Code Interpreter",
    "Function Calling",
    "LLM",
    "Operator",
    "Playground",
    "Realtime API",
    "Sora",
    # Add more terms here
]
# Fixed translations for specific English terms, keyed by language code.
# Each entry is rendered into the prompt as a "* term -> translation" line.
eng_to_non_eng_mapping = {
    "ja": {
        "agents": "エージェント",
        "computer use": "コンピュータ操作",
        "OAI hosted tools": "OpenAI がホストするツール",
        "well formed data": "適切な形式のデータ",
        "guardrail": "ガードレール",
        "handoffs": "ハンドオフ",
        "function tools": "関数ツール",
        "tracing": "トレーシング",
        "code examples": "コード例",
        "vector store": "ベクトルストア",
        "deep research": "ディープリサーチ",
        "category": "カテゴリー",
        "user": "ユーザー",
        "parameter": "パラメーター",
        "processor": "プロセッサー",
        "server": "サーバー",
        "web search": "Web 検索",
        "file search": "ファイル検索",
        "streaming": "ストリーミング",
        "system prompt": "システムプロンプト",
        "Python first": "Python ファースト",
        # Add more Japanese mappings here
    },
    # Add more languages here
}
# Free-form guideline bullets added to the prompt. The "common" entries are
# emitted first, followed by the entries for the active language code.
eng_to_non_eng_instructions = {
    "common": [
        "* The term 'examples' must be code examples when the page mentions the code examples in the repo, it can be translated as either 'code examples' or 'sample code'.",
        "* The term 'primitives' can be translated as basic components.",
        "* When the terms 'instructions' and 'tools' are mentioned as API parameter names, they must be kept as is.",
        "* The terms 'temperature', 'top_p', 'max_tokens', 'presence_penalty', 'frequency_penalty' as parameter names must be kept as is.",
    ],
    "ja": [
        "* The term 'result' in the Runner guide context must be translated like 'execution results'",
        "* The term 'raw' in 'raw response events' must be kept as is",
        "* You must consistently use polite wording such as です/ます rather than である/なのだ.",
        # Add more Japanese mappings here
    ],
    # Add more languages here
}
def built_instructions(target_language: str, lang_code: str) -> str:
    """Build the instruction prompt for translating one markdown chunk.

    Args:
        target_language: Language name interpolated into the prompt
            (for example ``"Japanese"``).
        lang_code: Key used to look up the term mapping and the extra
            guidelines for the language (for example ``"ja"``). An unknown
            code yields an empty term list and only the common guidelines.

    Returns:
        The full instruction text, with the do-not-translate terms, the
        term mapping and the guidelines filled in.
    """
    do_not_translate_terms = "\n".join(do_not_translate)
    specific_terms = "\n".join(
        f"* {source_term} -> {translated_term}"
        for source_term, translated_term in eng_to_non_eng_mapping.get(lang_code, {}).items()
    )
    specific_instructions = "\n".join(
        eng_to_non_eng_instructions.get("common", [])
        + eng_to_non_eng_instructions.get(lang_code, [])
    )
    # The prompt text starts at column 0 on purpose: any indentation here
    # would become part of the instructions. Compound words use plain ASCII
    # hyphens so that they stay readable in every editor and code viewer.
    return f"""You are an expert technical translator.
Your task: translate the markdown passed as a user input from English into {target_language}.
The inputs are the official OpenAI Agents SDK framework documentation, and your translation outputs will be used for serving the official {target_language} version of them. Thus, accuracy, clarity, and fidelity to the original are critical.
############################
## OUTPUT REQUIREMENTS ##
############################
You must return **only** the translated markdown. Do not include any commentary, metadata, or explanations. The original markdown structure must be strictly preserved.
#########################
## GENERAL RULES ##
#########################
- Be professional and polite.
- Keep the tone **natural** and concise.
- Do not omit any content. If a segment should stay in English, copy it verbatim.
- Do not change the markdown data structure, including the indentations.
- Keep all placeholders such as `CODE_BLOCK_*` and `CODE_LINE_PREFIX` unchanged.
- Convert asset paths: `./assets/…` → `../assets/…`.
  *Example:* `![img](./assets/pic.png)` → `![img](../assets/pic.png)`
- Treat the **Do-Not-Translate list** and **Term-Specific list** as case-insensitive; preserve the original casing you see.
- Skip translation for:
  - Inline code surrounded by single backticks ( `like_this` ).
  - Fenced code blocks delimited by ``` or ~~~, including all comments inside them.
  - Link URLs inside `[label](URL)`: translate the label, never the URL.
#########################
## LANGUAGE-SPECIFIC ##
#########################
*(applies only when {target_language} = Japanese)*
- Insert a half-width space before and after all alphanumeric terms.
- Add a half-width space just outside markdown emphasis markers: ` **太字** ` (good) vs `** 太字 **` (bad).
#########################
## DO NOT TRANSLATE ##
#########################
When replacing the following terms, do not have extra spaces before/after them:
{do_not_translate_terms}
#########################
## TERM-SPECIFIC ##
#########################
Translate these terms exactly as provided (no extra spaces):
{specific_terms}
#########################
## EXTRA GUIDELINES ##
#########################
{specific_instructions}
#########################
## IF UNSURE ##
#########################
If you are uncertain about a term, leave the original English term in parentheses after your translation.
#########################
## WORKFLOW ##
#########################
Follow the following workflow to translate the given markdown text data:
1. Read the input markdown text given by the user.
2. Translate the markdown file into {target_language}, carefully following the requirements above.
3. Perform a self-review to evaluate the quality of the translation, focusing on naturalness, accuracy, and consistency in detail.
4. If improvements are necessary, refine the content without changing the original meaning.
5. Continue improving the translation until you are fully satisfied with the result.
6. Once the final output is ready, return **only** the translated markdown text. No extra commentary.
"""
# Helpers and entry point for translating and saving a single file
def _split_markdown_into_chunks(
    content: str, max_chunk_lines: int = 120
) -> tuple[list[str], list[str]]:
    """Split markdown into chunks, replacing fenced code with placeholders.

    Every block fenced by ``` lines is removed from the text and replaced by
    a single ``CODE_BLOCK_NN`` line, so the model never sees the code. A new
    chunk is started at the first heading line (one starting with ``#``) seen
    once the current chunk holds at least ``max_chunk_lines`` lines; chunks
    are never split inside a code block.

    Args:
        content: Full markdown text of one page.
        max_chunk_lines: Minimum chunk length before a heading starts a new
            chunk.

    Returns:
        ``(chunks, code_blocks)`` where ``code_blocks[i]`` is the original
        text (fence lines included) behind placeholder number ``i``.
    """
    chunks: list[str] = []
    code_blocks: list[str] = []
    current_chunk: list[str] = []
    fence_lines: list[str] = []
    in_code_block = False

    for line in content.splitlines():
        if len(current_chunk) >= max_chunk_lines and not in_code_block and line.startswith("#"):
            chunks.append("\n".join(current_chunk))
            current_chunk = []

        is_fence = line.strip().startswith("```")
        if in_code_block or is_fence:
            fence_lines.append(line)
        else:
            current_chunk.append(line)

        if is_fence:
            if in_code_block:
                # Closing fence: store the block and leave a placeholder.
                code_blocks.append("\n".join(fence_lines))
                current_chunk.append(f"CODE_BLOCK_{(len(code_blocks) - 1):02}")
                fence_lines = []
            in_code_block = not in_code_block

    if fence_lines:
        # The page ended inside an unterminated fence. Keep those lines as a
        # code block instead of silently dropping the rest of the page.
        code_blocks.append("\n".join(fence_lines))
        current_chunk.append(f"CODE_BLOCK_{(len(code_blocks) - 1):02}")

    if current_chunk:
        chunks.append("\n".join(current_chunk))
    return chunks, code_blocks


def _restore_code_blocks(translated_text: str, code_blocks: list[str]) -> str:
    """Put the original code blocks back in place of their placeholders.

    The text is scanned once, so ``CODE_BLOCK_10`` can never match the prefix
    of ``CODE_BLOCK_100`` and placeholder-like text inside a restored code
    block is left untouched. Placeholders with an unknown index are kept
    unchanged.
    """
    import re  # local import: the file's import section does not include re

    def restore(match: "re.Match[str]") -> str:
        digits = match.group(1)
        index = int(digits)
        # Accept only the exact zero-padded spelling that was generated.
        if index < len(code_blocks) and digits == f"{index:02}":
            return code_blocks[index]
        return match.group(0)

    # A function replacement is inserted literally (no backslash processing).
    return re.sub(r"CODE_BLOCK_(\d+)", restore, translated_text)


def translate_file(file_path: str, target_path: str, lang_code: str) -> None:
    """Translate one markdown file and write the result to ``target_path``.

    The page is split into chunks with its fenced code blocks masked out,
    each chunk is translated with a separate API request, the translated
    chunks are joined with newlines, and the code blocks are restored.

    Args:
        file_path: Path of the English markdown file to read.
        target_path: Path the translated markdown is written to. Its parent
            directory must already exist.
        lang_code: Key into ``languages`` selecting the target language.

    Raises:
        KeyError: If ``lang_code`` is not a key of ``languages``.
    """
    print(f"Translating {file_path} into a different language: {lang_code}")
    with open(file_path, encoding="utf-8") as f:
        content = f.read()

    chunks, code_blocks = _split_markdown_into_chunks(content)

    # The instructions depend only on the language, so build them once.
    instructions = built_instructions(languages[lang_code], lang_code)
    # Models whose name starts with "o" are called without a temperature
    # (presumably because those models do not accept it; confirm against the
    # API docs). All other models use temperature 0.0.
    request_options = {} if OPENAI_MODEL.startswith("o") else {"temperature": 0.0}

    # Translate each chunk separately and combine the results.
    translated_content: list[str] = []
    for chunk in chunks:
        response = openai_client.responses.create(
            model=OPENAI_MODEL,
            instructions=instructions,
            input=chunk,
            **request_options,
        )
        translated_content.append(response.output_text)

    translated_text = _restore_code_blocks("\n".join(translated_content), code_blocks)

    # Save the combined translated content
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(translated_text)
def translate_single_source_file(file_path: str) -> None:
    """Translate one source page into every configured language.

    Files that do not end in ``.md`` and files located under a directory
    named ``ref`` are skipped. For every language code in ``languages`` the
    translation is written to ``<source_dir>/<lang_code>/<path relative to
    source_dir>``; missing target directories are created.

    Args:
        file_path: Path of the source file, inside ``source_dir``.
    """
    if not file_path.endswith(".md"):
        return

    relative_path = os.path.relpath(file_path, source_dir)
    # Compare whole directory names: a plain `"ref/" in path` substring test
    # would also skip directories that merely end in "ref" (e.g. "xref/") and
    # would miss "ref\" on Windows.
    parent_dirs = os.path.normpath(relative_path).split(os.sep)[:-1]
    if "ref" in parent_dirs:
        return

    for lang_code in languages:
        target_path = os.path.join(source_dir, lang_code, relative_path)
        # Ensure the target directory exists
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        # Translate and save the file
        translate_file(file_path, target_path, lang_code)
def main():
    """Translate every page under ``source_dir`` using a thread pool.

    The top-level directories named after a language code (the translation
    targets) are not visited. Every remaining file is passed to
    ``translate_single_source_file``. An exception raised by any worker is
    re-raised here after the tasks that have not started yet are cancelled.
    """
    # Increasing this will make the translation faster; you can decide considering the model's capacity
    concurrency = 6

    # Traverse the source directory
    file_paths = []
    for root, dir_names, file_names in os.walk(source_dir):
        if os.path.relpath(root, source_dir) == os.curdir:
            # Skip the target directories. Pruning by exact name (instead of
            # a substring test on the path) keeps directories such as
            # "django" from being skipped just because they contain "ja".
            dir_names[:] = [name for name in dir_names if name not in languages]
        file_paths.extend(os.path.join(root, file_name) for file_name in file_names)

    # One pool for the whole run; max_workers already limits the parallelism.
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [
            executor.submit(translate_single_source_file, file_path)
            for file_path in file_paths
        ]
        try:
            # Call result() on every future so that no failure goes unnoticed.
            for future in futures:
                future.result()
        except BaseException:
            # Do not start further translations once one of them has failed.
            for future in futures:
                future.cancel()
            raise

    print("Translation completed.")
if __name__ == "__main__":
    # To translate a single page (for example while tuning the prompt), call
    # translate_single_source_file("docs/index.md") here instead of main().
    main()