# 共计 1904 个字符,预计需要花费 5 分钟才能阅读完成。
# 爬取知乎日志 (crawl Zhihu Daily)
# coding:utf-8
import re
import HTMLParser
import urllib2
import requests
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
def getHtml(url):
    """Download *url* and return the raw response body as a byte string.

    A desktop-browser User-Agent header is sent — presumably to avoid the
    site rejecting the default urllib2 agent (TODO confirm).
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        # Read the whole body before releasing the connection.
        return response.read()
    finally:
        # Fix: the original never closed the response, leaking the HTTP
        # connection on every call.
        response.close()
# Extract the story hyperlinks from the front-page HTML.
def getUrl(html):
    """Return the absolute URL of every story linked from *html*.

    Scans for anchors of the form <a href="/story/...">, and prefixes each
    story id with the Zhihu Daily host.
    """
    # Compile once so repeated calls reuse the same pattern object.
    story_pattern = re.compile('<a href="/story/(.*?)"', re.S)
    return ['http://daily.zhihu.com/story/' + story_id
            for story_id in story_pattern.findall(html)]
# Fetch one story page and print its title plus the cleaned article text.
def getContent(url):
    """Download the story at *url*, then print its headline and body text."""
    html = getHtml(url)
    # NOTE(review): compiled without re.S, so the <h1> must sit on one line;
    # if no headline matches, items[0] below raises IndexError (the caller
    # in main() catches it).
    pattern = re.compile('<h1 class="headline-title">(.*?)</h1>')
    items = re.findall(pattern,html)
    print '**************************'+items[0]+'********************************'
    # 匹配文章内容 — match the article body: everything from the first <p>
    # of the content div up to the closing </div> (re.S spans newlines).
    pattern = re.compile('<div class="content">\\n<p>(.*?)</div>',re.S)
    items_withtag = re.findall(pattern,html)
    print items_withtag
    for item in items_withtag:
        # Strip inline tags/entities chunk by chunk and print each piece.
        for content in characterProcessing(item):
            print content
# Strip the tags inside an article fragment and join the text pieces.
def characterProcessing(html):
    """Clean an article fragment and return its text pieces as a list.

    Splits *html* into <p>/<li> chunks, unescapes HTML entities, skips
    chunks that embed a hyperlink, flattens one level of inline tags in
    the rest, and collects the cleaned strings.
    """
    htmlParser = HTMLParser.HTMLParser()
    # Each findall hit is a 2-tuple (<p> text, <li> text); the branch that
    # did not match is the empty string.
    pattern = re.compile('<p>(.*?)</p>|<li>(.*?)</li>.*?', re.S)
    items = re.findall(pattern, html)
    result = []
    for index in items:
        # NOTE(review): index is a tuple, so comparing it to '' is always
        # True — this guard never filters anything.
        if index != '':
            for content in index:
                tag = re.search('<.*?>', content)
                http = re.search('<.*?http.*?', content)
                html_tag = re.search('&', content)
                if html_tag:
                    # Convert entities such as &amp; back to characters.
                    content = htmlParser.unescape(content)
                if http:
                    # Chunk embeds a link — drop it entirely.
                    continue
                elif tag:
                    # Re-join the text around one level of inline tags.
                    # NOTE(review): rebinds the outer `pattern`/`items`
                    # names; harmless here because the outer iterator was
                    # already created, but fragile.
                    pattern = re.compile('(.*?)<.*?>(.*?)</.*?>(.*)')
                    items = re.findall(pattern, content)
                    content_tags = ''
                    if len(items) > 0:
                        for item in items:
                            if len(item) > 0:
                                for item_s in item:
                                    content_tags = content_tags + item_s
                            else:
                                # NOTE(review): item_s is unbound on this
                                # branch; findall tuples are never empty,
                                # so this looks like dead code — confirm.
                                content_tags = content_tags + item_s
                        # Remove any tags the re-join left behind.
                        content_tags = re.sub('<.*?>', '', content_tags)
                        result.append(content_tags)
                    else:
                        continue
                else:
                    # Plain-text chunk: keep as-is.
                    result.append(content)
    return result
def main():
    """Crawl the Zhihu Daily front page and print every linked story."""
    url = 'http://daily.zhihu.com/'
    html = getHtml(url)
    urls = getUrl(html)
    for url in urls:
        # Best-effort: one malformed story page must not stop the crawl,
        # so report the error and move on to the next URL.
        try:
            getContent(url)
        except Exception,e:
            print e
if __name__ == "__main__":
    main()
# 正文完 (end of article)