Zhihu.py

227次阅读
没有评论

爬取知乎日报

# coding:utf-8
import re
import HTMLParser
import urllib2
import requests
import sys
# Python 2 only: sys.setdefaultencoding is not reachable on the already-imported
# sys module, so the module is reloaded first and the process-wide default
# str/unicode codec is then switched to UTF-8. The two lines must stay in this order.
reload(sys)
sys.setdefaultencoding('utf-8')


def getHtml(url):
    """Download *url* and return the raw response body.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body exactly as read from the connection (not decoded).

    Raises:
        Whatever urllib2.urlopen raises on failure (e.g. urllib2.URLError).
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    request = urllib2.Request(url,headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Always release the connection, even when read() fails part-way.
        response.close()

#Collect the story hyperlinks
def getUrl(html):
    """Return the absolute story URLs referenced by *html*.

    Every anchor of the form <a href="/story/ID" ...> contributes
    'http://daily.zhihu.com/story/ID'; order and duplicates are kept as
    they appear in the page.
    """
    story_link = re.compile('<a href="/story/(.*?)"',re.S)
    return ['http://daily.zhihu.com/story/'+story_id
            for story_id in story_link.findall(html)]


#获取标题+文章
def getContent(url):
    html = getHtml(url)
    pattern = re.compile('<h1 class="headline-title">(.*?)</h1>')
    items = re.findall(pattern,html)
    print '**************************'+items[0]+'********************************'


    #匹配文章内容
    pattern = re.compile('<div class="content">\\n<p>(.*?)</div>',re.S)
    items_withtag = re.findall(pattern,html)
    print items_withtag
    for item in items_withtag:
        for content in characterProcessing(item):
            print content

#去掉文章中间的标签,连接
def characterProcessing(html):
    htmlParser = HTMLParser.HTMLParser()
    pattern = re.compile('<p>(.*?)</p>|<li>(.*?)</li>.*?', re.S)
    items = re.findall(pattern, html)
    result = []
    for index in items:

        if index != '':
            for content in index:
                tag = re.search('<.*?>', content)
                http = re.search('<.*?http.*?', content)
                html_tag = re.search('&', content)
                if html_tag:
                    content = htmlParser.unescape(content)

                if http:
                    continue
                elif tag:

                    pattern = re.compile('(.*?)<.*?>(.*?)</.*?>(.*)')
                    items = re.findall(pattern, content)
                    content_tags = ''
                    if len(items) > 0:
                        for item in items:
                            if len(item) > 0:
                                for item_s in item:
                                    content_tags = content_tags + item_s
                            else:
                                content_tags = content_tags + item_s
                        content_tags = re.sub('<.*?>', '', content_tags)
                        result.append(content_tags)
                    else:
                        continue
                else:
                    result.append(content)
    return result
def main():
    url = 'http://daily.zhihu.com/'
    html = getHtml(url)
    urls = getUrl(html)
    for url in urls:
        try:
            getContent(url)
        except Exception,e:
            print e
# Start the crawl only when the file is run as a script, not when imported.
if __name__ == "__main__":
    main()

 

Honest1y
版权声明:本站原创文章,由Honest1y于2017-06-21发表,共计1904字。
转载提示:除特殊说明外本站文章皆由CC-4.0协议发布,转载请注明出处。
评论(没有评论)
载入中...