for title in novelList:
    # print(title.get_text())  # chapter title
    novel_title.append(str(title.get_text()))
    # print(title.a['href'])  # chapter link, e.g. /dushu/41/41357/15990708.html
    # full URL: https://www.read8.net/dushu/41/41357/15990708.html
    novel_link.append(str(novel_link_pre + title.a['href']))
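# Aside (not part of the original script): the concatenation above assumes every
# href is site-relative. urllib.parse.urljoin handles the join equally well and
# also tolerates hrefs that are already absolute; a minimal sketch, with the
# example_* names being hypothetical illustrations:
from urllib.parse import urljoin

example_href = "/dushu/41/41357/15990708.html"
example_link = urljoin("https://www.read8.net", example_href)
# example_link == "https://www.read8.net/dushu/41/41357/15990708.html"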
# Scrape the body text of a single chapter
def getChapterText(baseurl):
    html = requests.get(baseurl)
    soup = BeautifulSoup(html.text, 'html.parser')
    text = soup.find(id="content")  # the chapter content node
    text = str(text.get_text())
    text = text.replace(" ", "\n ")  # add line breaks
    # Strip the site's injected ads and boilerplate (the strings are matched verbatim):
    text = text.replace("天才一秒记住本站地址:https://www.read8.net", "")
    text = text.replace("网上直接搜索: ”(阅)(读)(悦)” 20万本热门小说免费看,,精彩!", "")
    text = text.replace("章节错误,点此报送,报送后维护人员会在两分钟内校正章节内容,请耐心等待。", "")
    text = text.replace("转载请注明出处:https://www.read8.net", "")
    text = text.replace("《神武主宰》来源:", "")
    text = text.replace("https://www.read8.net", "")
    return text
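# Design note (a sketch, not from the original script): the chain of replace()
# calls above can be kept in one list, so new boilerplate strings only need to
# be appended. The list below copies the strings from getChapterText verbatim;
# the bare URL stays last so the longer entries containing it still match.
AD_STRINGS = [
    "天才一秒记住本站地址:https://www.read8.net",
    "网上直接搜索: ”(阅)(读)(悦)” 20万本热门小说免费看,,精彩!",
    "章节错误,点此报送,报送后维护人员会在两分钟内校正章节内容,请耐心等待。",
    "转载请注明出处:https://www.read8.net",
    "《神武主宰》来源:",
    "https://www.read8.net",
]

def stripAds(text):
    # Remove every known boilerplate string from the chapter text.
    for ad in AD_STRINGS:
        text = text.replace(ad, "")
    return text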
# Write a chapter's title and content to the txt file
def saveTxt(title, text, filename):
    with open(filename, 'a', encoding='utf-8') as file:
        file.write(title + "\n\n")
        file.write(text)
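# Usage sketch (hypothetical title, text, and filename, for illustration only).
# Because the file is opened in append mode ('a'), each call adds one chapter
# to the end of the same file; re-running the script appends duplicate chapters
# unless the output file is removed first:
#
#     saveTxt("Chapter 1", "chapter text...", "novel.txt")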
getChapter(baseurl, novel_link_pre)
for i in range(novel_begin_index, len(novel_title)):
    text = getChapterText(novel_link[i])
    saveTxt(novel_title[i], text, filename)
    print("Scraped:", novel_title[i], "progress: (", i - novel_begin_index + 1, "/", len(novel_title) - novel_begin_index, ")")
    time.sleep(1)  # throttle requests to avoid hammering the site
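# Robustness sketch (an assumption, not part of the original script): a crawl of
# this length will occasionally hit a transient network error, and one failed
# GET currently aborts the whole run. A retry wrapper such as the one below
# could be used in place of the bare requests.get in getChapterText; the
# retries/timeout values are illustrative.
def fetchWithRetry(url, retries=3, timeout=10):
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()  # treat HTTP 4xx/5xx as failures too
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries: surface the error
            time.sleep(2 ** attempt)  # simple exponential backoff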