Crawling Zhihu Daily
The script below (Python 2) fetches the Zhihu Daily front page, follows every /story/ link it finds, and prints each article's title followed by its tag-stripped body text.

# coding:utf-8
import re
import sys
import HTMLParser
import urllib2

# Python 2: make UTF-8 the default encoding so Chinese text prints cleanly
reload(sys)
sys.setdefaultencoding('utf-8')


def getHtml(url):
    # Fetch a page with a browser User-Agent so the request is not rejected
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/58.0.3029.110 Safari/537.36'}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()


# Collect the story hyperlinks from the front page
def getUrl(html):
    # Compile the pattern once for efficiency
    pattern = re.compile('<a href="/story/(.*?)"', re.S)
    items = re.findall(pattern, html)
    urls = []
    for item in items:
        urls.append('http://daily.zhihu.com/story/' + item)
    return urls


# Print the title and body of one story
def getContent(url):
    html = getHtml(url)
    pattern = re.compile('<h1 class="headline-title">(.*?)</h1>')
    items = re.findall(pattern, html)
    print '**************************' + items[0] + '********************************'
    # Match the article body
    pattern = re.compile('<div class="content">\n<p>(.*?)</div>', re.S)
    items_withtag = re.findall(pattern, html)
    for item in items_withtag:
        for content in characterProcessing(item):
            print content


# Strip the tags inside the article and keep only the text
def characterProcessing(html):
    htmlParser = HTMLParser.HTMLParser()
    pattern = re.compile('<p>(.*?)</p>|<li>(.*?)</li>', re.S)
    items = re.findall(pattern, html)
    result = []
    for groups in items:
        # findall returns one tuple per match; only one group is non-empty
        for content in groups:
            if content == '':
                continue
            if re.search('&', content):
                # Decode HTML entities such as &amp; and &quot;
                content = htmlParser.unescape(content)
            if re.search('<.*?http', content):
                # Skip paragraphs that are just links
                continue
            # Drop any remaining inline tags and keep the text between them
            result.append(re.sub('<.*?>', '', content))
    return result


def main():
    url = 'http://daily.zhihu.com/'
    html = getHtml(url)
    urls = getUrl(html)
    for url in urls:
        try:
            getContent(url)
        except Exception as e:
            print e


if __name__ == "__main__":
    main()
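For comparison, here is a minimal sketch of the same crawl ported to Python 3, assuming only the third-party requests package. The function names (get_html, get_story_urls, get_content) are mine, and the regexes mirror the ones above, so they will break whenever daily.zhihu.com changes its markup.

# coding:utf-8
import re
from html import unescape

import requests

HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/58.0.3029.110 Safari/537.36'}


def get_html(url):
    # requests handles header plumbing and charset decoding for us
    return requests.get(url, headers=HEADERS).text


def get_story_urls(front_page):
    ids = re.findall(r'<a href="/story/(.*?)"', front_page)
    return ['http://daily.zhihu.com/story/' + i for i in ids]


def get_content(url):
    html = get_html(url)
    titles = re.findall(r'<h1 class="headline-title">(.*?)</h1>', html)
    if titles:
        print('*' * 26 + titles[0] + '*' * 26)
    for body in re.findall(r'<div class="content">\n<p>(.*?)</div>', html, re.S):
        for groups in re.findall(r'<p>(.*?)</p>|<li>(.*?)</li>', body, re.S):
            for text in groups:
                if not text or re.search(r'<.*?http', text):
                    continue  # empty alternation group, or a link-only paragraph
                # Strip leftover inline tags, then decode HTML entities
                print(unescape(re.sub(r'<.*?>', '', text)))


if __name__ == '__main__':
    for story in get_story_urls(get_html('http://daily.zhihu.com/')):
        try:
            get_content(story)
        except Exception as e:
            print(e)

Note that requests.get(...).text decodes the response body for you, which replaces both the urllib2 boilerplate and the setdefaultencoding hack that the Python 2 version needs.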