#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Scrape historical chronology URLs with Playwright.

Starting from the page for 841 BC, follow the "下一篇" (next article)
link page by page, collect each page's URL and year, and save the
results to a JSON file sorted chronologically.
"""
import json
import os
import re
import sys
from datetime import datetime

try:
    from playwright.sync_api import sync_playwright
except ImportError:
    # Playwright is a hard requirement; exit with install instructions.
    print("请安装playwright: pip install playwright")
    print("并运行: playwright install chromium")
    sys.exit(1)

# ============== Configuration ==============
START_URL = 'https://www.ilishi.com.cn/2024/101776.html'  # starting page (841 BC)
OUTPUT_FILE = 'zhou_ilishi_history_urls.json'
HEADLESS = True   # run the browser headless
TIMEOUT = 30000   # page-load timeout in milliseconds
# ===========================================
def extract_year_from_title(title):
    """Extract the year from a page title.

    Looks for "公元前X年" (X BC) or "公元X年" (X AD) in *title*.
    Uses a single pattern with capture groups instead of re-searching
    the matched substring for digits twice, as the original did.

    Args:
        title: Page title string.

    Returns:
        A ``(year, year_str)`` tuple where ``year`` is an int (negative
        for BC years) and ``year_str`` is the matched substring, or
        ``(None, None)`` when no year is found.
    """
    match = re.search(r'公元(前)?(\d+)年', title)
    if not match:
        return None, None
    year = int(match.group(2))
    if match.group(1):  # "前" present -> BC -> negate the year
        year = -year
    return year, match.group(0)
def scrape_history_urls(start_url, output_file):
    """Crawl chronology pages starting at *start_url* and save them as JSON.

    Follows the "下一篇" (next article) link from page to page, recording
    each page's year (negative int for BC), matched year string, cleaned
    title and URL. Records are sorted by year and written to
    *output_file* as UTF-8 JSON.

    Args:
        start_url: URL of the first chronology page to visit.
        output_file: Path of the JSON file to write.

    Returns:
        The list of collected record dicts, sorted by year.
    """
    results = []
    visited = set()  # guard against revisiting pages (link cycles)
    page_num = 0
    max_pages = 500  # hard page cap to prevent an endless crawl
    print(f"开始抓取,起始URL: {start_url}")
    print("=" * 50)
    with sync_playwright() as p:
        # Launch the browser.
        browser = p.chromium.launch(headless=HEADLESS)
        context = browser.new_context()
        page = context.new_page()
        current_url = start_url
        while current_url and page_num < max_pages:
            page_num += 1
            # Stop on a URL we have already seen (cycle in the chain).
            if current_url in visited:
                print(f"[{page_num}] 跳过已访问: {current_url}")
                break
            visited.add(current_url)
            try:
                # Load the page.
                print(f"[{page_num}] 访问: {current_url}")
                page.goto(current_url, timeout=TIMEOUT)
                # Grab the page title.
                title = page.title()
                print(f" 标题: {title}")
                # Extract the year from the title.
                year, year_str = extract_year_from_title(title)
                if year is None:
                    print(f" [警告] 无法从标题提取年份: {title}")
                    # Fall back to scanning the page body for a year.
                    content = page.content()
                    year_match = re.search(r'(公元前\d+年|公元\d+年)', content)
                    if year_match:
                        year_str = year_match.group(1)
                        if '公元前' in year_str:
                            year = -int(re.search(r'\d+', year_str).group())
                        else:
                            year = int(re.search(r'\d+', year_str).group())
                        print(f" 从内容提取年份: {year_str}")
                    else:
                        print(f" [停止] 无法提取年份,终止抓取")
                        break
                # Record the result.
                results.append({
                    'year': year,
                    'year_str': year_str,
                    'title': title.replace(' - 爱历史网', ''),
                    'url': current_url
                })
                print(f" 年份: {year_str}, URL: {current_url}")
                # Locate the "下一篇" (next article) link.
                next_link = None
                # Strategy 1: text locator on the "下一篇" label.
                try:
                    next_element = page.locator('text=下一篇').first
                    if next_element.count() > 0:
                        # Follow the anchor next to the label.
                        next_link = next_element.locator('xpath=../a').get_attribute('href')
                except Exception:
                    # Was a bare `except:`; narrowed so Ctrl-C still works.
                    pass
                # Strategy 2: query the known container class in the DOM.
                if not next_link:
                    try:
                        # Look for the link inside p.fr.chao.f16 elements.
                        next_link = page.evaluate('''
                            () => {
                                const elements = document.querySelectorAll('.fr.chao.f16');
                                for (const el of elements) {
                                    const link = el.querySelector('a');
                                    if (link && link.href) {
                                        return link.href;
                                    }
                                }
                                return null;
                            }
                        ''')
                    except Exception:
                        # Was a bare `except:`; narrowed so Ctrl-C still works.
                        pass
                # Strategy 3: regex over the raw HTML.
                if not next_link:
                    html = page.content()
                    match = re.search(r'下一篇:<a href="(/2024/\d+\.html)">([^<]+)</a>', html)
                    if match:
                        next_link = 'https://www.ilishi.com.cn' + match.group(1)
                        print(f" 通过正则找到下一篇: {next_link}")
                if next_link:
                    # Normalize relative links to absolute URLs.
                    if next_link.startswith('/'):
                        next_link = 'https://www.ilishi.com.cn' + next_link
                    # Heuristic sanity check on the next-page URL.
                    # if '10260' in next_link or '/2024/1026' in next_link:
                    if 'fl' in next_link or 'jn' in next_link:
                        print(f" [停止] 下一篇链接异常: {next_link}")
                        break
                    else:
                        current_url = next_link
                        print(f" -> 下一篇: {current_url}\n")
                else:
                    print(f" [停止] 未找到下一页链接")
                    break
            except Exception as e:
                # Any page-level failure ends the crawl but keeps results.
                print(f" [错误] 处理页面失败: {e}")
                break
        browser.close()
    # Sort the collected records chronologically.
    print("\n" + "=" * 50)
    print("按年份排序...")
    sorted_results = sorted(results, key=lambda x: x['year'])
    # Write the sorted records as JSON.
    print(f"保存到: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sorted_results, f, ensure_ascii=False, indent=2)
    print(f"\n抓取完成! 共获取 {len(sorted_results)} 条记录")
    # Show a small sample of the results.
    print("\n前5条:")
    for item in sorted_results[:5]:
        print(f" {item['year_str']}: {item['url']}")
    print("\n后5条:")
    for item in sorted_results[-5:]:
        print(f" {item['year_str']}: {item['url']}")
    return sorted_results
def main():
    """Entry point: print the run configuration, then start the crawl."""
    banner = "=" * 50
    print(banner)
    print("历史年表URL抓取工具")
    print(banner)
    print(f"起始URL: {START_URL}")
    print(f"输出文件: {OUTPUT_FILE}")
    print(f"无头模式: {HEADLESS}")
    print(banner + "\n")
    # Kick off the crawl using the module-level configuration.
    scrape_history_urls(START_URL, OUTPUT_FILE)
    print("\n完成!")


if __name__ == '__main__':
    main()