Python爬虫初尝试

PS:看小说总有一些奇怪的广告,不知道的人还以为我在看奇怪的东西..

所以把小说下载下来,存到自己服务器去

第一步:2种方式下载页面

方式1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# _*_ coding:utf-8 _*_
import urllib.request
import urllib.parse
import lxml.html
import lxml.cssselect
import os

# Download helper used by every other step of the crawler.
def download(url, time=2, data=None, user_agent="test", proxy=None):
    """Fetch *url* and return the response body decoded as UTF-8.

    Retries up to *time* more times when the request fails; returns None
    once all attempts are exhausted.

    Args:
        url: Address to fetch.
        time: Remaining retry count (counts down on each failure).
        data: Optional request body handed straight to urllib (POST when set).
        user_agent: Value for the User-agent header.
        proxy: Optional proxy address applied to the URL's scheme.

    Returns:
        The page source as a str, or None on repeated failure.
    """
    print("downloading.....", url)
    headers = {'User-agent': user_agent,
               "accept-language": "zh-CN,zh;q=0.9,en;q=0.8"}
    request = urllib.request.Request(url, data, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        # Map the URL's scheme (http/https) to the supplied proxy address.
        proxy_parm = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_parm))
    try:
        res = opener.open(request).read()
    except urllib.error.URLError as e:
        print("download error ", e.reason)
        if time > 0:
            # BUG FIX: the original discarded the recursive retry's result
            # (then crashed on str(None, ...)) and dropped the keyword
            # arguments; propagate both.
            return download(url, time - 1, data, user_agent, proxy)
        return None
    # "Success" was originally printed on the exhausted-retry path; it
    # belongs on the success path.
    print("Success")
    return str(res, encoding="utf-8")

方式2:

1
2
3
# Alternative: the third-party `requests` library does the same in two lines.
# BUG FIX: `Import` was capitalized in the original, which is a SyntaxError.
import requests
r = requests.get("http://www.baidu.com")
# r.text holds the decoded page content

注 :遇到问题多使用 dir 和 help 函数来查看相关函数的使用方法

第二步:解析页面url

1
2
3
4
5
6
7
8
9
10
11
def getLinks(html):
    """Parse a chapter-index page and collect its chapter links.

    The chapter list is assumed to live in the page's first <dl> element,
    one link per child element.

    Args:
        html: HTML source of the index page.

    Returns:
        A list of {"title": ..., "href": ...} dicts, empty when the page
        has no <dl> element.
    """
    retList = []
    dls = lxml.html.fromstring(html).cssselect("dl")
    # BUG FIX: the original indexed [0] unconditionally and raised
    # IndexError on pages without a <dl>.
    if not dls:
        return retList
    for item in dls[0]:
        anchors = item.cssselect("a")
        if not anchors:
            # Section headers (e.g. <dt>) carry no link; skip them instead
            # of crashing on anchors[0].
            continue
        tag = anchors[0]
        retList.append({"title": tag.get("title"), "href": tag.get("href")})
    return retList

这里用的是 lxml 模块,效率和 re 正则库差不多,详情百度 lxml 就可以查阅到相关资料

第三步:解析页面

1
2
3
4
5
def getContent(page):
    """Download one chapter page and return its body text.

    Args:
        page: Full URL of the chapter page.

    Returns:
        Text of the element with id="content", or "" when the page could
        not be fetched or lacks that element.
    """
    html = download(page)
    if html is None:
        # Nothing came back from the network; skip instead of crashing.
        return ""
    data = lxml.html.fromstring(html).get_element_by_id("content", None)
    # BUG FIX: the original called .text_content() on the None default
    # when the page had no #content element.
    return data.text_content() if data is not None else ""

第四步:写到文件

1
2
3
4
5
6
def writeText(text, dir, i):
    """Write *text* to "<dir>/<i>.txt", creating the directory if needed.

    Args:
        text: Chapter text to store.
        dir: Target directory (name kept for backward compatibility even
            though it shadows the builtin).
        i: Chapter number used as the file's base name.
    """
    # exist_ok avoids the isdir/mkdir race of the original check-then-create.
    os.makedirs(dir, exist_ok=True)
    path = os.path.join(dir, str(i) + ".txt")
    # Mode "w" already truncates, so the original's explicit truncate()
    # call was redundant and has been dropped.
    with open(path, "w", encoding="utf-8") as data:
        data.write(text)

第五步:转换下格式

1
2
def getUrl(home, href):
    """Return the absolute chapter URL: site base *home* followed by *href*."""
    return "".join((home, href))

第六步:完成整体工作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def main(url, dir):
    """Crawl the chapter index at *url* and save every chapter under *dir*.

    Args:
        url: Base URL of the novel's chapter-index page.
        dir: Directory the numbered chapter files are written into.
    """
    index_html = download(url)
    for size, dic in enumerate(getLinks(index_html), start=1):
        writeDic = {"title": dic["title"],
                    "content": getContent(getUrl(url, dic["href"]))}
        # Re-insert the paragraph breaks the site encodes as escaped sequences.
        text = str(writeDic).replace(r"\xa0\xa0\xa0\xa0", "\r\n ").replace(r"\n", "\n")
        writeText(text, dir, size)
# Chapter-index URLs of the novels to mirror (biquke.com).
urlFrxxz = "http://www.biquke.com/bq/0/990/"
urlLwcs = "http://www.biquke.com/bq/22/22565/"
urlWldf = "http://www.biquke.com/bq/0/362/"
urlXhdtgs = "http://www.biquke.com/bq/0/98/"

if __name__ == "__main__":
    # Guard so importing this module no longer kicks off all four crawls.
    main(urlFrxxz, "frxxz")
    main(urlLwcs, "lwcs")
    main(urlWldf, "wldf")
    main(urlXhdtgs, "xhdtsgs")