资讯专栏INFORMATION COLUMN

python爬虫如何爬知乎的话题?

SimpleTriangle / 2744人阅读

摘要:有什么不懂的,麻烦去转盘网找我,因为这个也是我开发的,上面会及时更新qq群号,这里不留qq号啥的,以免被系统给K了。

因为要做观点,观点的屋子类似于知乎的话题,所以得想办法把它给爬下来,搞了半天最终还是妥妥地搞定了。代码是python写的,不懂的麻烦自学哈!懂的直接看代码,绝对可用。

#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # 精确除法
from Queue import Queue
from __builtin__ import False
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup

# Python 2 idiom: re-import sys so setdefaultencoding is callable, then make
# UTF-8 the default codec for implicit str <-> unicode conversions in this
# script (e.g. the str() calls applied to scraped text further down).
reload(sys)
sys.setdefaultencoding( "utf-8" )

# HTTP headers sent with the AJAX POST to the topic-plaza endpoint.
# The Cookie value itself contains double quotes (cap_id="..."), so it is
# written with single-quoted literals; adjacent literals are concatenated by
# the parser into one header value.
# NOTE: the cookie holds session values copied from a browser -- replace them
# with your own before running.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    "Referer": "https://www.zhihu.com/topics",
    "Cookie": (
        '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; '
        'q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; '
        '_za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; '
        'aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; '
        '_xsrf=9d494558f9271340ab24598d85b2a3c8; '
        'cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; '
        'n_c=1; '
        '_alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
    ),
}

# MySQL connection settings used by mdb.connect() below.
DB_HOST = "127.0.0.1"
DB_USER = "root"
DB_PASS = "root"

queue= Queue() # receive queue: holds (token, topic_name, parent_name) tuples waiting to be crawled
# The names below are module-level state; apart from `counter` (incremented in
# getContent) none of them is read by the functions visible in this file.
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
counter=0
base=""

# One shared connection/cursor to the "zhihu" database for the whole script.
# Autocommit is off, so every write must be followed by an explicit conn.commit().
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "zhihu", charset="utf8")
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    """Fetch *url* and return the raw response body, or None on failure.

    The request uses a 3-second timeout. Fetching is best-effort: any error
    is reported on stdout and turned into a None return so the caller can
    skip the page and carry on crawling.
    """
    response = None
    try:
        # TODO: a proxy should be plugged in here.
        response = urllib2.urlopen(urllib2.Request(url), None, 3)
        return response.read()
    except Exception as e:
        # Exception (rather than a bare except) keeps Ctrl-C / SystemExit
        # working while still tolerating network and HTTP errors.
        print("fetch error %s %s" % (url, e))
        return None
    finally:
        if response is not None:
            response.close()

def getTopics():
    """Scrape the top-level topic categories and store the new ones.

    Downloads https://www.zhihu.com/topics, reads every
    <li class="zm-topic-cat-item"> element and inserts its data-id / text into
    the classify_new table unless a row with the same name already exists.
    All inserts are committed together; on any error the pending inserts are
    rolled back and the error is printed.
    """
    url = "https://www.zhihu.com/topics"
    print(url)
    try:
        # TODO: a proxy should be plugged in here.
        # The timeout keeps a stalled connection from hanging the script.
        response = urllib2.urlopen(urllib2.Request(url), None, 5)
        try:
            html = response.read().decode("utf-8")
        finally:
            response.close()
        soup = BeautifulSoup(html)
        for li in soup.find_all("li", {"class" : "zm-topic-cat-item"}):
            data_id = li.get("data-id")
            name = li.text
            # Parameters are passed as a real 1-tuple: "(name)" is just a
            # parenthesised scalar, not a sequence.
            curr.execute("select id from classify_new where name=%s", (name,))
            if not curr.fetchone():
                curr.execute("INSERT INTO classify_new(data_id,name)VALUES(%s,%s)", (data_id, name))
        conn.commit()
    except Exception as e:
        print("get topic error %s" % (e,))
        # Do not leave a half-finished batch pending on the shared connection,
        # where a later commit elsewhere would silently persist it.
        conn.rollback()
        

def get_extension(name):
    """Return the part of *name* from its last "." to the end, dot included.

    Returns None when *name* contains no "." at all.
    """
    _, dot, tail = name.rpartition(".")
    if not dot:
        return None
    return dot + tail


def which_platform():
    """Return the operating-system name reported by platform.system()."""
    return platform.system()

def GetDateString():
    """Return today's local date formatted as "YYYY-MM-DD"."""
    now = time.localtime()
    return str(time.strftime("%Y-%m-%d", now))

def makeDateFolder(par,classify):
    """Ensure <par>/<today>/<classify> exists and return that path.

    The path is joined with "/" on Linux and with "//" on every other
    platform. Returns None when *par* is not an existing directory or when
    creating the folder fails (the error is printed).
    """
    try:
        if not os.path.isdir(par):
            return None
        today = GetDateString()
        leaf = str(classify)
        sep = "/" if which_platform()=="Linux" else "//"
        target = par + sep + today + sep + leaf
        if not os.path.isdir(target):
            os.makedirs(target)
        return target
    except Exception as e:
        print("kk %s" % (e,))
    return None

def download_img(url,classify,top="E://topic_pic"):
    """Download the image at *url* into today's folder for *classify*.

    Args:
        url: image URL; it must contain a "." so an extension can be derived.
        classify: sub-folder name (converted with str()) below the date folder.
        top: existing root directory that receives the dated folders.

    Returns:
        True when *url* contains the marker "e82bab09c_m" (presumably the
        site's default avatar -- the caller then stores no avatar at all);
        the site-relative path "/room/default/<date>/<classify>/<file>" when
        the image was saved; None when the URL has no extension, the target
        folder is unavailable, the file already exists, or the download or
        write fails.
    """
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        # Decide before touching the network: marker images are never saved,
        # so downloading them first was wasted work.
        if "e82bab09c_m" in str(url):
            return True

        resp = urllib2.urlopen(urllib2.Request(url), None, 3)
        try:
            dataimg = resp.read()
        finally:
            resp.close()

        name = str(uuid.uuid1()).replace("-","") + "_www.guandn.com" + extention
        folder = makeDateFolder(top, classify)
        if folder is None:
            # Without this guard a None path would reach os.path.exists().
            print("eee no target folder under %s" % (top,))
            return None

        filename = folder + "//" + name
        if os.path.exists(filename):
            print("file exist")
            return None
        try:
            # "with" closes the file even if the write fails half-way.
            with open(filename, "wb") as file_object:
                file_object.write(dataimg)
        except IOError as e1:
            print("e1= %s" % (e1,))
            return None
        return "/room/default/" + GetDateString() + "/" + str(classify) + "/" + name
    except Exception as e:
        print("eee %s" % (e,))
    return None  # nothing was downloaded; the caller decides what to do

def getChildren(node,name):
    """Queue the child topics of *node* that are not stored as rooms yet.

    Fetches https://www.zhihu.com/topic/<node>/hot. When the page has a
    "child-topic" block whose text mentions child topics, every
    <a class="zm-item-tag"> link whose inner markup is not yet a name in the
    rooms table is pushed onto the global queue as
    (data-token, inner markup, title of the current topic).

    Args:
        node: topic token used in the URL.
        name: unused; kept so existing callers keep working.

    Errors are printed and swallowed so one bad page does not stop the crawl.
    """
    try:
        html = get_html("https://www.zhihu.com/topic/" + str(node) + "/hot")
        if html is None:
            return
        soup = BeautifulSoup(html)
        node_name = soup.find("div", {"id" : "zh-topic-title"}).find("h1").text
        topic_cla = soup.find("div", {"class" : "child-topic"})
        if topic_cla is None:
            return
        try:
            if u"子话题" not in topic_cla.text:
                return
            # Collect every tag link on the page as a candidate child node.
            for a in soup.find_all("a", {"class" : "zm-item-tag"}):
                token = a.get("data-token")
                markup = str(a).replace("\n","").replace("\t","").replace("\r","")
                # Keep what sits between the opening tag's ">" and the
                # closing "</a>".
                start = markup.find(">")
                end = markup.rfind("</a>")
                new_node = markup[start+1:end]
                # Names must stay unique, so skip topics that already exist.
                curr.execute("select id from rooms where name=%s", (new_node,))
                y = curr.fetchone()
                if not y:
                    print("y= %s new_node= %s token= %s" % (y, new_node, token))
                    queue.put((token, new_node, node_name))
        except Exception as e:
            print("add queue error %s" % (e,))
    except Exception as e:
        print("get html error %s" % (e,))
        
    

def getContent(n,name,p,top_id):
    """Store topic *n* as a row in the rooms table unless it already exists.

    Args:
        n: topic token used to build the topic URL.
        name: topic name as seen by the caller; used for the first
            existence check only (the stored name is the page title).
        p: name of the parent room; its id becomes father_id (2 if unknown).
        top_id: passed to download_img() as the avatar sub-folder.

    The row is inserted and committed immediately so that children processed
    later can look up this room as their parent. Nothing is stored when the
    page cannot be fetched or the avatar download returns None. All errors
    are printed and swallowed.
    """
    global counter
    try:
        # SQL parameters are passed as real 1-tuples throughout.
        curr.execute("select id from rooms where name=%s", (name,))
        y = curr.fetchone()
        print("exist??  %s n= %s" % (y, n))
        if y:
            return

        html = get_html("https://www.zhihu.com/topic/" + str(n) + "/hot")
        if html is None:
            return
        soup = BeautifulSoup(html)
        title = soup.find("div", {"id" : "zh-topic-title"}).find("h1").text
        pic_path = soup.find("a", {"id" : "zh-avartar-edit-form"}).find("img").get("src")
        description = soup.find("div", {"class" : "zm-editable-content"})
        if description is not None:
            description = description.text
        # "Uncategorised" / "root topic" pages are still stored (original
        # author's note: this avoids an endless loop) but without description.
        if u"未归类" in title or u"根话题" in title:
            description = None

        tag_path = download_img(pic_path, top_id)
        print("tag_path= %s" % (tag_path,))
        if tag_path is None:
            return
        if tag_path is True:
            # True means "store the room without an avatar".
            tag_path = None

        father_id = 2  # default parent room ("miscellaneous")
        curr.execute("select id from rooms where name=%s", (p,))
        results = curr.fetchall()
        if results:
            father_id = results[-1][0]  # last matching row wins

        name = title
        # Check again with the real page title: names must stay unique.
        curr.execute("select id from rooms where name=%s", (name,))
        y = curr.fetchone()
        print("store see.. %s" % (y,))
        if y:
            return

        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        curr.execute(
            "INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (father_id, name, 0, description, create_time, None, tag_path, 1, 0, None))
        # Commit right away, otherwise children cannot find their parent row.
        conn.commit()
        counter = counter + 1  # count only rows that were actually stored
        if counter % 200 == 0:
            print("current node %s num %s" % (name, counter))
    except Exception as e:
        print("get content error %s" % (e,))

def work():
    """Crawl every seed topic listed in the classify table (status=1).

    Each seed is put on the global queue as (node, name, parent). The queue
    is then drained: every dequeued topic is stored via getContent() and its
    children are queued via getChildren(), until the queue is empty. Errors
    for one seed are printed and the next seed is processed.
    """
    curr.execute("select id,node,parent,name from classify where status=1")
    for top_id, node, parent, name in curr.fetchall():
        try:
            queue.put((node, name, parent))  # seed the queue first
            while queue.qsize() > 0:
                # Queue items are 3-tuples (token, name, parent name), the
                # same shape getChildren() puts on the queue.
                n, topic_name, p = queue.get()
                getContent(n, topic_name, p, top_id)
                getChildren(n, topic_name)  # queue the children of this topic
            conn.commit()
        except Exception as e:
            print("what's wrong %s" % (e,))
            
def new_work():
    """Run get_topis() for every category in classify_new_copy with status=1.

    A failure in one category is printed and does not stop the remaining
    categories from being processed.
    """
    curr.execute("select id,data_id,name from classify_new_copy where status=1")
    for top_id, data_id, name in curr.fetchall():
        try:
            get_topis(data_id, name, top_id)
        except Exception as e:
            # Report instead of silently discarding; Exception (not a bare
            # except) also lets Ctrl-C stop the crawl.
            print("new_work error %s %s" % (data_id, e))


def get_topis(data_id,name,top_id):
    """Page through one category of the topic plaza and crawl what it lists.

    POSTs to /node/TopicsPlazzaListV2 with offsets 0, 20, 40, ... and stops
    when the returned "msg" list has fewer than 5 entries. Each topic found
    in the parsed fragment is queued with *name* as its parent, and the queue
    is then drained through getContent() / getChildren().

    Args:
        data_id: category id sent as "topic_id" in the request.
        name: category name, used as parent name of the topics found.
        top_id: forwarded to getContent().

    Request errors are printed and the page is skipped; after three failed
    requests in a row the category is abandoned, so a dead network cannot
    keep this loop spinning forever.
    """
    url = "https://www.zhihu.com/node/TopicsPlazzaListV2"
    data_id = str(data_id)
    max_failures = 3
    failures = 0
    offset = 0
    while True:
        # The endpoint expects "params" to be a JSON document in form encoding.
        params = '{"topic_id":%s,"offset":%d,"hash_id":""}' % (data_id, offset)
        values = {"method": "next", "params": params}
        offset += 20
        try:
            request = urllib2.Request(url, urllib.urlencode(values), headers)
            response = urllib2.urlopen(request, None, 5)
            try:
                html = response.read().decode("utf-8")
            finally:
                response.close()
            ms = json.loads(html)["msg"]
            if len(ms) < 5:
                break
            # NOTE(review): only the first entry of `ms` is parsed; the other
            # entries are ignored -- confirm this is intended.
            msg = ms[0]
            failures = 0
        except Exception as e:
            print("eeeee %s" % (e,))
            failures += 1
            if failures >= max_failures:
                break
            continue
        if msg is None:
            continue

        soup = BeautifulSoup(str(msg))
        for blk in soup.find_all("div", {"class" : "blk"}):
            page = blk.find("a").get("href")
            if page is None:
                continue
            node = page.replace("/topic/", "")  # more seeds to store
            ne = blk.find("strong").text
            try:
                # The parent is always the category name; the dequeue loop
                # uses its own variable so *name* is never overwritten.
                queue.put((node, ne, name))
                while queue.qsize() > 0:
                    n, topic_name, p = queue.get()  # head of the queue
                    size = queue.qsize()
                    if size > 0:
                        print(size)
                    getContent(n, topic_name, p, top_id)
                    getChildren(n, topic_name)  # queue this topic's children
                conn.commit()
            except Exception as e:
                print("what's wrong %s" % (e,))
            
        
if __name__ == "__main__":
    # Run the whole crawl 400 times in a row.
    for _ in range(400):
        new_work()

说下数据库的问题,我这里就不传附件了,看字段自己建立,因为这确实太简单了,我是用的mysql,你看自己的需求自己建。

有什么不懂的,麻烦去转盘网找我,因为这个也是我开发的,上面会及时更新qq群号,这里不留qq号啥的,以免被系统给K了。

文章版权归作者所有,未经允许请勿转载,若此文章存在违规行为,您可以联系管理员删除。

转载请注明本文地址:https://www.ucloud.cn/yun/41199.html

相关文章

  • 基于 Electron 的爬虫框架 Nightmare

    摘要:话题精华即为知乎的高票回答。下面的项目中还包含了另外一个爬取的知乎的动态。 作者:William本文为原创文章,转载请注明作者及出处 Electron 可以让你使用纯 JavaScript 调用 Chrome 丰富的原生的接口来创造桌面应用。你可以把它看作一个专注于桌面应用的 Node.js 的变体,而不是 Web 服务器。其基于浏览器的应用方式可以极方便的做各种响应式的交互,接下来介...

    Harriet666 评论0 收藏0
  • 一只node爬虫的升级打怪之路

    摘要:我是一个知乎轻微重度用户,之前写了一只爬虫帮我爬取并分析它的数据,我感觉这个过程还是挺有意思,因为这是一个不断给自己创造问题又去解决问题的过程。所以这只爬虫还有登陆知乎搜索题目的功能。 我一直觉得,爬虫是许多web开发人员难以回避的点。我们也应该或多或少的去接触这方面,因为可以从爬虫中学习到web开发中应当掌握的一些基本知识。而且,它还很有趣。 我是一个知乎轻微重度用户,之前写了一只爬...

    shiweifu 评论0 收藏0
  • Evil Python

    摘要:用将倒放这次让我们一个用做一个小工具将动态图片倒序播放发现引力波的机构使用的包美国科学家日宣布,他们去年月首次探测到引力波。宣布这一发现的,是激光干涉引力波天文台的负责人。这个机构诞生于上世纪年代,进行引力波观测已经有近年。 那些年我们写过的爬虫 从写 nodejs 的第一个爬虫开始陆陆续续写了好几个爬虫,从爬拉勾网上的职位信息到爬豆瓣上的租房帖子,再到去爬知乎上的妹子照片什么的,爬虫...

    Turbo 评论0 收藏0
  • 网站信息采集

    摘要:网站信息采集在编写爬虫之前可能需要先了解和搜集网站信息协议也称为爬虫协议机器人协议等的全称是网络爬虫排除标准,网站通过协议告诉搜索引擎哪些页面可以抓取,哪些页面不能抓取。 网站信息采集 在编写爬虫之前可能需要先了解和搜集网站信息 robots.txt Robots协议(也称为爬虫协议、机器人协议等)的全称是网络爬虫排除标准(Robots Exclusion Protocol),网站通过...

    AZmake 评论0 收藏0
  • 全面超越Appium,使用Airtest超快速开发App爬虫

    摘要:代码运行完成以后,微信被打开了。能不能像前面打开知乎一样,使用这个属性呢也行,也不行。滑动屏幕使用的命令为,滑动屏幕需要使用坐标信息。单独使用控制手机在 想开发网页爬虫,发现被反爬了?想对 App 抓包,发现数据被加密了?不要担心,使用 Airtest 开发 App 爬虫,只要人眼能看到,你就能抓到,最快只需要2分钟,兼容 Unity3D、Cocos2dx-*、Android 原生 A...

    noONE 评论0 收藏0

发表评论

0条评论

SimpleTriangle

|高级讲师

TA的文章

阅读更多
最新活动
阅读需要支付1元查看
<