1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
| from typing import List, Dict, Any
import ollama
import re
import time
from tqdm import tqdm
from dotenv import load_dotenv
import os
# Load environment variables (via python-dotenv) before the settings below read them.
load_dotenv()
# ===================== Global configuration (tuned for translategemma:4b) =====================
# Model tag passed to Ollama; can be overridden with the MODEL_NAME environment variable.
MODEL_NAME = os.getenv("MODEL_NAME", "translategemma:4b")
# Markdown file to translate and the file the translation is written to.
INPUT_FILE = "yearandyear.md"
OUTPUT_FILE = "translated_reflect_final.md"
# Key setting: the author sized blocks for a 4B model, citing a 2K context window
# and a 1024-token sliding window (figures not verifiable from this file).
# Blocks are capped at 500 characters, which the author estimates at ~350 tokens.
MAX_BLOCK_LENGTH = 500
# Maximum number of review/refine rounds applied to each block.
MAX_REFLECT_ITERATIONS = 1
# Regexes that locate Markdown images and inline links so they can be shielded
# from translation.
#
# The bracket and parenthesis parts use negated character classes instead of a
# lazy `.*?`: with `.*?` (especially under re.DOTALL) a match could begin at any
# earlier "[" -- a citation such as "[1]" or a previously inserted placeholder --
# and run across ordinary prose and newlines up to the next "](...)", hiding
# that prose from the translator.
#
# One level of nesting is accepted in both parts, so "[[IMG_000]](href)" (an
# image wrapped in a link) and URLs like ".../Foo_(bar)" still match as a whole.
# Only non-capturing groups are used, so re.findall() keeps returning the full
# matched text.
IMG_PATTERN = r'!\[(?:[^\[\]]|\[[^\[\]]*\])*\]\((?:[^()]|\([^()]*\))*\)'
LINK_PATTERN = r'\[(?:[^\[\]]|\[[^\[\]]*\])*\]\((?:[^()]|\([^()]*\))*\)'
# ===================== Sociology glossary (keeps term translations consistent) =====================
# Add the key terms of the book being translated here.
# Maps an English term to the Chinese rendering the model is instructed to use;
# the entries are rendered into the initial translation prompt in this order.
SOCIOLOGY_TERMS = {
    "social structure": "社会结构",
    "capital": "资本",
    "cultural capital": "文化资本",
    "social class": "社会阶层",
    "power": "权力",
    "ideology": "意识形态",
    "modernity": "现代性",
    "rationality": "理性",
    "institution": "制度",
    "identity": "身份认同",
    "legitimacy": "合法性",
    "hegemony": "霸权",
    "agency": "能动性",
    "structure": "结构"
}
# ===================== 记忆模块 =====================
class Memory:
    """Ordered log of translation and review records for one block.

    Each record is a dict with a ``"type"`` key (``"execution"`` for a
    translation, ``"reflection"`` for review feedback) and a ``"content"`` key
    holding the text.
    """

    def __init__(self):
        self.records: List[Dict[str, Any]] = []

    def add_record(self, record_type: str, content: str):
        """Append a record of the given type to the end of the log."""
        entry = {"type": record_type, "content": content}
        self.records.append(entry)

    def get_last_execution(self) -> str:
        """Return the most recent translation, or ``""`` if none was recorded."""
        newest_first = reversed(self.records)
        return next(
            (entry['content'] for entry in newest_first if entry['type'] == 'execution'),
            "",
        )

    def get_trajectory(self) -> str:
        """Render the translation/review history as one labelled text.

        Records of any other type are skipped. Leading and trailing whitespace
        of the combined text is stripped.
        """
        sections = []
        for entry in self.records:
            if entry['type'] == 'execution':
                sections.append(f"【翻译结果】\n{entry['content']}\n\n")
            elif entry['type'] == 'reflection':
                sections.append(f"【反思建议】\n{entry['content']}\n\n")
        return "".join(sections).strip()
# ===================== 反思型翻译智能体 =====================
# 1. Initial translation prompt (enforces the glossary and an academic
#    sociology register). Placeholders: {terms_list} is the rendered glossary,
#    {content} is the Markdown text to translate.
INIT_TRANSLATE_PROMPT = """
你是专业的现代社会学著作翻译专家,必须严格遵守:
【翻译规则】
1. 术语必须统一:以下词汇必须固定翻译,全程不能变
{terms_list}
2. 格式严格保留:所有Markdown标题、粗体、列表、段落格式完全不变
3. 忠于原文:不增、不减、不改意思
4. 语言流畅:符合中文学术规范
5. 只输出译文,无任何多余文字
待翻译文本:
{content}
"""
# 2. Review prompt (focus: consistent terminology and preserved formatting).
#    Placeholder: {translation} is the translation under review. The reviewer is
#    told to answer 【无需改进】 when nothing needs changing.
#    NOTE(review): the template carries only the translation, not the source
#    text, although it asks the reviewer to check for mistranslations and
#    omissions -- consider passing the source as well.
REFLECT_PROMPT = """
你是严格的社会学翻译评审,请检查:
1. 关键名词是否全程统一翻译
2. 有无错译、漏译
3. Markdown格式是否完整保留
4. 语句是否通顺
只输出改进建议,完美则输出:【无需改进】
译文:
{translation}
"""
# 3. Refinement prompt. Placeholder: {trajectory} is the history of
#    translations and review feedback produced so far.
#    NOTE(review): neither the source text nor the glossary is included here.
REFINE_TRANSLATE_PROMPT = """
你是社会学翻译专家,请根据反思优化译文:
1. 严格统一术语
2. 保留格式
3. 只输出最终译文
历史记录:
{trajectory}
"""
class ReflectTranslationAgent:
    """Translate a text block, then review and refine it with an Ollama chat model.

    The per-block history (translations and review feedback) is kept in
    ``self.memory`` and reset at the start of every ``translate_block`` call.
    """

    def __init__(self, model_name: str, max_iterations: int = 1):
        """Store the Ollama model tag and the maximum number of review rounds."""
        self.model_name = model_name
        self.memory = Memory()
        self.max_iterations = max_iterations

    def _llm(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message to the local Ollama model.

        Returns the reply text with surrounding whitespace stripped. Any
        failure is reported on stdout and turned into ``""`` so that one bad
        call does not abort a long translation run.
        """
        try:
            resp = ollama.chat(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}]
            )
            return resp['message']['content'].strip()
        except Exception as e:
            print(f"模型调用失败:{str(e)}")
            return ""

    def translate_block(self, content: str) -> str:
        """Translate one block and return the best translation obtained.

        Steps: an initial translation, then up to ``max_iterations`` rounds of
        review followed by refinement. The loop stops early when the reviewer
        answers that no improvement is needed.

        Returns ``""`` only when the initial translation itself failed, so the
        caller can detect the failure. A failed or empty review or refinement
        ends the loop and leaves the last good translation in place instead of
        overwriting it with an empty string.
        """
        self.memory.records = []
        terms_text = "\n".join(f"- {k} → {v}" for k, v in SOCIOLOGY_TERMS.items())

        # Initial translation.
        init_prompt = INIT_TRANSLATE_PROMPT.format(
            terms_list=terms_text,
            content=content
        )
        first_trans = self._llm(init_prompt)
        self.memory.add_record("execution", first_trans)
        if not first_trans:
            # Nothing to review: skip the pointless review/refine calls.
            return ""

        # Review / refine rounds.
        for _ in range(self.max_iterations):
            last_trans = self.memory.get_last_execution()
            feedback = self._llm(REFLECT_PROMPT.format(translation=last_trans))
            if not feedback:
                # The review call failed; keep the translation we already have.
                break
            self.memory.add_record("reflection", feedback)
            if "无需改进" in feedback:
                break
            refined = self._llm(
                REFINE_TRANSLATE_PROMPT.format(trajectory=self.memory.get_trajectory())
            )
            if not refined:
                # Never let a failed refinement replace a usable translation.
                break
            self.memory.add_record("execution", refined)
        return self.memory.get_last_execution()
# ===================== 分块函数(适配4b模型)=====================
def _split_long_line(line, max_len):
    """Cut one over-long line into consecutive pieces of at most max_len chars.

    Each cut is placed after the last sentence end (". ", "? ", "! ") inside
    the window, otherwise after the last space, otherwise exactly at max_len.
    Concatenating the pieces gives back the original line.
    """
    pieces = []
    rest = line
    while len(rest) > max_len:
        window = rest[:max_len]
        sentence_end = max(window.rfind(mark) for mark in (". ", "? ", "! "))
        if sentence_end != -1:
            cut = sentence_end + 2  # keep the punctuation and the space
        else:
            cut = window.rfind(" ") + 1  # 0 when the window has no space
        if cut <= 0:
            cut = max_len  # no natural break point: hard cut
        pieces.append(rest[:cut])
        rest = rest[cut:]
    if rest:
        pieces.append(rest)
    return pieces


def split_markdown_blocks(text, max_len=500):
    """Split text into blocks of whole lines, each at most max_len characters.

    Windows line endings are normalised to "\\n". Consecutive lines are packed
    greedily into a block as long as the joined block stays within max_len. A
    single line longer than max_len is first cut into pieces (see
    ``_split_long_line``), so no returned block exceeds the limit. Blocks are
    stripped, and blocks that are empty after stripping are dropped.

    Args:
        text: The document to split.
        max_len: Maximum block length in characters; must be at least 1.

    Returns:
        A list of non-empty block strings.

    Raises:
        ValueError: If max_len is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    lines = []
    for line in text.replace("\r\n", "\n").split("\n"):
        if len(line) > max_len:
            lines.extend(_split_long_line(line, max_len))
        else:
            lines.append(line)

    blocks = []
    current_block = []
    current_length = 0  # always equals len("\n".join(current_block))
    for line in lines:
        # A line joining a non-empty block also costs one newline separator.
        added = len(line) + 1 if current_block else len(line)
        if current_block and current_length + added > max_len:
            blocks.append("\n".join(current_block))
            current_block = [line]
            current_length = len(line)
        else:
            current_block.append(line)
            current_length += added
    if current_block:
        blocks.append("\n".join(current_block))

    stripped = (block.strip() for block in blocks)
    return [block for block in stripped if block]
# ===================== 工具函数 =====================
def extract_placeholders(text: str, pattern: str, prefix: str):
    """Replace every match of pattern in text with a placeholder token.

    Placeholders look like ``[PREFIX_000]`` and are numbered in order of first
    appearance. Identical matched texts share one placeholder, so the returned
    mapping has exactly one entry per distinct protected snippet.

    The substitution is done in a single ``re.sub`` pass using the whole match
    (``group(0)``). Each occurrence is replaced where it was found, and
    patterns containing capture groups work as well.

    Args:
        text: The text to scan.
        pattern: Regular expression source; it is applied with ``re.DOTALL``.
        prefix: Label embedded in the placeholder, e.g. ``"IMG"``.

    Returns:
        A ``(new_text, holders)`` tuple, where ``holders`` maps each
        placeholder to the original text it stands for.
    """
    holders = {}
    assigned = {}  # matched text -> placeholder already issued for it

    def _swap(match):
        original = match.group(0)
        placeholder = assigned.get(original)
        if placeholder is None:
            placeholder = f"[{prefix}_{len(assigned):03d}]"
            assigned[original] = placeholder
            holders[placeholder] = original
        return placeholder

    replaced = re.sub(pattern, _swap, text, flags=re.DOTALL)
    return replaced, holders
def restore_placeholders(text, holders):
    """Return text with every placeholder key of holders replaced by its original value.

    Placeholders are substituted one after another in the mapping's iteration
    order; every occurrence of a placeholder is replaced.
    """
    restored = text
    for placeholder in holders:
        restored = restored.replace(placeholder, holders[placeholder])
    return restored
# ===================== 主流程 =====================
def main():
    """Translate INPUT_FILE block by block and write the result to OUTPUT_FILE.

    Steps: check that the Ollama server answers, read the input file, replace
    images and links with placeholders, split the text into blocks, translate
    each block with ``ReflectTranslationAgent``, join the results, restore the
    placeholders and write the output file.

    A block whose translation comes back empty is kept in its original
    (untranslated) form rather than dropped, and the number of such blocks is
    reported at the end.
    """
    print("=" * 60)
    print(" 反思型社会学翻译Agent(适配translategemma:4b)")
    print("=" * 60)

    # Check that the Ollama server is reachable. `except Exception` (not a bare
    # except) lets Ctrl-C and SystemExit propagate.
    try:
        ollama.list()
    except Exception:
        print("❌ 请先启动 ollama serve")
        return

    # Read the input file, telling a missing file apart from other read errors.
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            raw = f.read()
    except FileNotFoundError:
        print("❌ 文件不存在")
        return
    except (OSError, UnicodeDecodeError) as e:
        print(f"❌ 无法读取文件:{e}")
        return
    print(f"✅ 读取文件:{INPUT_FILE},{len(raw)} 字符")

    # Shield images first, then links, so they are not translated.
    text, img_ph = extract_placeholders(raw, IMG_PATTERN, "IMG")
    text, link_ph = extract_placeholders(text, LINK_PATTERN, "LINK")
    print(f"✅ 保护 {len(img_ph)} 图 {len(link_ph)} 链接")

    # Split into model-sized blocks.
    blocks = split_markdown_blocks(text, MAX_BLOCK_LENGTH)
    print(f"✅ 文档分为 {len(blocks)} 块(安全适配4B模型)")
    if not blocks:
        print("❌ 无内容")
        return

    # Translate block by block.
    agent = ReflectTranslationAgent(MODEL_NAME, MAX_REFLECT_ITERATIONS)
    results = []
    failed = 0
    print("\n开始翻译...\n")
    for block in tqdm(blocks, desc="翻译进度"):
        trans = agent.translate_block(block)
        if not trans:
            # Keep the source text so no content silently disappears.
            failed += 1
            trans = block
        results.append(trans)
        time.sleep(0.4)

    # Merge and restore placeholders in reverse order of extraction (links,
    # then images), because a protected link may contain an image placeholder.
    final = "\n\n".join(results)
    final = restore_placeholders(final, link_ph)
    final = restore_placeholders(final, img_ph)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(final)

    print("\n🎉 翻译完成!")
    print(f"📄 输出文件:{OUTPUT_FILE}")
    if failed:
        print(f"⚠️ {failed} 块翻译失败,已保留原文")
    print("✅ 术语统一 ✅ 格式保留 ✅ 适配4B模型")


if __name__ == "__main__":
    main()
|