使用Python编写爬虫的基本模块及框架使用指南
基本模块
python爬虫,web spider。爬取网站获取网页数据,并进行分析提取。
基本模块使用的是 urllib、urllib2、re 等模块
基本用法,例子:
(1)进行基本GET请求,获取网页html
#!coding=utf-8
import urllib
import urllib2
url = 'http://www.baidu.com/'
# 获取请求
request = urllib2.Request(url)
try:
# 根据request,得到返回response
response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
if hasattr(e, 'reason'):
print e.reason
# 读取response的body
html = response.read()
# 读取response的headers
headers = response.info()
(2)表单提交
#!coding=utf-8
# Submit a form via HTTP POST, sending a custom User-Agent header.
import urllib2
import urllib

# Target URL of the form's action (fill in before use)
target = ''
# URL-encode the form fields into the POST body
payload = urllib.urlencode({
    'username': 'username',
    'password': 'password',
})
# Custom headers; many sites reject requests without a browser User-Agent
extra_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}
# Passing 'data' makes urllib2 issue a POST instead of a GET
req = urllib2.Request(url=target, data=payload, headers=extra_headers)
resp = urllib2.urlopen(req)
html = resp.read()
(3)结合 re 模块抓取网页并用正则表达式提取内容(以百度贴吧帖子为例)
#!coding=utf-8
# Fetch one page of a Baidu Tieba thread and prepare a regex to extract
# each post's content. (Snippet appears truncated: the compiled pattern is
# never applied here -- presumably myRe.findall(myPage) follows.)
import urllib2
import re
# Page number appended to the URL via the 'pn' query parameter;
# 'see_lz=1' restricts the thread to the original poster's posts
page_num = 1
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn='+str(page_num)
# Tieba pages are GBK-encoded, so decode the raw bytes explicitly
myPage = urllib2.urlopen(url).read().decode('gbk')
# Non-greedy capture of each post body div; re.DOTALL lets '.' match
# newlines so multi-line posts are captured in full
myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)