# Scrape the listing pages of one Douban book series.
# For every page, book titles are appended to `namelist` and the date-like
# substrings of each "pub" line are appended to `timelist`.
# Relies on `series`, `ranges`, `namelist` and `timelist` being defined
# earlier in the file.
url_head = 'https://book.douban.com/series/'
request_headers = {'user-agent': 'Mozilla/5.0'}

# Date-like patterns: "2019-5", "2019年5月", or four digits, any single
# character, then one or two digits. Compiled once because it never changes
# between pages.
# NOTE(review): the "." in the last alternative is unescaped, so it accepts
# any separator character (including another digit) -- confirm this is wanted.
date_pattern = re.compile(r'\d{4}-\d{1,2}|\d{4}年\d{1,2}月|\d{4}.\d{1,2}')

# range() stops before `ranges`, so the last page requested is ranges - 1.
for page in range(1, ranges):
    url = url_head + series + '?page=' + str(page) + '&order=time'
    # Without a timeout a stalled connection would block the script forever;
    # with one, requests raises requests.exceptions.Timeout instead.
    r = requests.get(url, headers=request_headers, timeout=10)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "html.parser")

    # Select "h2 a" rather than every <h2>: per the original author's note,
    # on pages that carry a synopsis find_all("h2") returned extra entries
    # with no matching time, which broke the later Excel write.
    for link in soup.select("h2 a"):
        # Drop all whitespace inside the title text.
        namelist.append("".join(link.get_text().split()))

    for pub in soup.find_all("div", class_="pub"):
        pub_text = pub.get_text().strip()
        matches = date_pattern.findall(pub_text)
        # findall returns a list; per the original author's note, writing the
        # list itself to a cell raised ValueError("Cannot convert ... to
        # Excel"), so its string form is stored instead.
        timelist.append(str(matches))
# Copy the collected data into `ws`, one book per row starting at row 1:
# column 1 receives the title, column 2 the matching entry of `timelist`.
# `ws` is presumably an openpyxl worksheet -- confirm against where it is created.
for row, title in enumerate(namelist, start=1):
    ws.cell(row, 1).value = title
    # Rows are 1-based while the lists are 0-based, hence row - 1.
    ws.cell(row, 2).value = timelist[row - 1]