用 Python 爬取豆瓣丛书书目

V2.1 修复丛书页面带有简介时无法写入 Excel 的 bug

首发于2021-01-20,公众号链接:一个翻书号

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import requests
from bs4 import BeautifulSoup
import re
import openpyxl

# Scrape the book list of one Douban series and save each title together
# with its publication date to an Excel workbook.
#
# Inputs (read from stdin):
#   series - the series id used in the URL path
#   ranges - how many listing pages to fetch, starting at page 1
# Output: '00.目录.xlsx' in the current working directory.

series = input('series编号:')
ranges = int(input('爬取页数:'))

wb = openpyxl.Workbook()
ws = wb.active
ws['A1'] = '书名'
ws['B1'] = '出版日期'

namelist = []
timelist = []

# Compiled once instead of on every iteration. Targets dates written like
# 2020-1, 2020年1月 or 2020.1.
# NOTE(review): the "." in the last alternative is unescaped, so it matches
# any separator character (and four-digit prices such as 1280.00) -- confirm
# whether r'\.' was intended before tightening it.
date_pattern = re.compile(r'\d{4}-\d{1,2}|\d{4}年\d{1,2}月|\d{4}.\d{1,2}')

url_head = 'https://book.douban.com/series/'
# "+ 1" so that asking for N pages really fetches pages 1..N; the previous
# range(1, ranges) stopped one page short (and fetched nothing for N == 1).
for page in range(1, ranges + 1):
    url = url_head + series + '?page=' + str(page) + '&order=time'
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=10)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "html.parser")

    # Select "h2 a" rather than every <h2>: on series pages that carry a
    # description, find_all("h2") picks up an extra heading that has no
    # matching publication date, which broke the Excel export.
    for item in soup.select("h2 a"):
        # split() + join() strips every whitespace run from the title.
        namelist.append("".join(item.get_text().split()))

    for item2 in soup.find_all("div", class_="pub"):
        matches = date_pattern.findall(item2.get_text().strip())
        # findall() returns a list, which openpyxl cannot store in a cell.
        # Join the matches into plain text instead of writing the list's
        # repr (e.g. "['2020-1']"); no match yields an empty string.
        timelist.append(", ".join(matches))

# Data starts at row 2 so the header written to A1/B1 is not overwritten.
# The two columns are filled independently so that a page with fewer date
# blocks than titles cannot raise IndexError.
for row, name in enumerate(namelist, start=2):
    ws.cell(row, 1).value = name
for row, pub_date in enumerate(timelist, start=2):
    ws.cell(row, 2).value = pub_date

wb.save('00.目录.xlsx')