# 共计 1904 个字符,预计需要花费 5 分钟才能阅读完成。
# 爬取知乎日志 (crawl Zhihu Daily)
# coding:utf-8
import re
import HTMLParser
import urllib2
import requests
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
def getHtml(url):
    """Download *url* and return the raw response body as a byte string.

    A desktop-browser User-Agent header is sent — presumably to avoid the
    site rejecting the default urllib2 agent (TODO confirm).
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        # Read the whole body before releasing the connection.
        return response.read()
    finally:
        # Fix: the original never closed the response, leaking the HTTP
        # connection on every call.
        response.close()
# Extract the story hyperlinks from the front-page HTML.
def getUrl(html):
    """Return the absolute URL of every story linked from *html*.

    Scans for anchors of the form <a href="/story/...">, and prefixes each
    story id with the Zhihu Daily host.
    """
    # Compile once so repeated calls reuse the same pattern object.
    story_pattern = re.compile('<a href="/story/(.*?)"', re.S)
    return ['http://daily.zhihu.com/story/' + story_id
            for story_id in story_pattern.findall(html)]
# Fetch one story page and print its title plus the cleaned article text.
def getContent(url):
    """Download the story at *url*, then print its headline and body text."""
    html = getHtml(url)
    # NOTE(review): compiled without re.S, so the <h1> must sit on one line;
    # if no headline matches, items[0] below raises IndexError (the caller
    # in main() catches it).
    pattern = re.compile('<h1 class="headline-title">(.*?)</h1>')
    items = re.findall(pattern,html)
    print '**************************'+items[0]+'********************************'
    # 匹配文章内容 — match the article body: everything from the first <p>
    # of the content div up to the closing </div> (re.S spans newlines).
    pattern = re.compile('<div class="content">\\n<p>(.*?)</div>',re.S)
    items_withtag = re.findall(pattern,html)
    print items_withtag
    for item in items_withtag:
        # Strip inline tags/entities chunk by chunk and print each piece.
        for content in characterProcessing(item):
            print content
# Strip the tags inside an article fragment and join the text pieces.
def characterProcessing(html):
    """Clean an article fragment and return its text pieces as a list.

    Splits *html* into <p>/<li> chunks, unescapes HTML entities, skips
    chunks that embed a hyperlink, flattens one level of inline tags in
    the rest, and collects the cleaned strings.
    """
    htmlParser = HTMLParser.HTMLParser()
    # Each findall hit is a 2-tuple (<p> text, <li> text); the branch that
    # did not match is the empty string.
    pattern = re.compile('<p>(.*?)</p>|<li>(.*?)</li>.*?', re.S)
    items = re.findall(pattern, html)
    result = []
    for index in items:
        # NOTE(review): index is a tuple, so comparing it to '' is always
        # True — this guard never filters anything.
        if index != '':
            for content in index:
                tag = re.search('<.*?>', content)
                http = re.search('<.*?http.*?', content)
                html_tag = re.search('&', content)
                if html_tag:
                    # Convert entities such as &amp; back to characters.
                    content = htmlParser.unescape(content)
                if http:
                    # Chunk embeds a link — drop it entirely.
                    continue
                elif tag:
                    # Re-join the text around one level of inline tags.
                    # NOTE(review): rebinds the outer `pattern`/`items`
                    # names; harmless here because the outer iterator was
                    # already created, but fragile.
                    pattern = re.compile('(.*?)<.*?>(.*?)</.*?>(.*)')
                    items = re.findall(pattern, content)
                    content_tags = ''
                    if len(items) > 0:
                        for item in items:
                            if len(item) > 0:
                                for item_s in item:
                                    content_tags = content_tags + item_s
                            else:
                                # NOTE(review): item_s is unbound on this
                                # branch; findall tuples are never empty,
                                # so this looks like dead code — confirm.
                                content_tags = content_tags + item_s
                        # Remove any tags the re-join left behind.
                        content_tags = re.sub('<.*?>', '', content_tags)
                        result.append(content_tags)
                    else:
                        continue
                else:
                    # Plain-text chunk: keep as-is.
                    result.append(content)
    return result
def main():
    """Crawl the Zhihu Daily front page and print every linked story."""
    url = 'http://daily.zhihu.com/'
    html = getHtml(url)
    urls = getUrl(html)
    for url in urls:
        # Best-effort: one malformed story page must not stop the crawl,
        # so report the error and move on to the next URL.
        try:
            getContent(url)
        except Exception,e:
            print e
if __name__ == "__main__":
    main()
# 正文完 (end of article)