Abstract: If anything here is unclear, come find me on quzhuanpan (去转盘网) — I built that site too, and the QQ group number there is kept up to date. I'm not leaving a group number here, to keep it from being blocked by the system.
Because we are building the "viewpoints" feature (观点), whose rooms work much like Zhihu's topics, I had to find a way to crawl them. After quite a bit of fiddling I finally got it working reliably. The code is written in Python; if you don't know Python, please go learn the basics first. If you do, just read the code — it works as-is.
#coding:utf-8
"""
@author: haoning
@create time: 2015.8.5
"""
from __future__ import division  # true division
from Queue import Queue
import json
import os
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    "Referer": "https://www.zhihu.com/topics",
    "Cookie": '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = "127.0.0.1"
DB_USER = "root"
DB_PASS = "root"

queue = Queue()      # work queue of (topic_token, topic_name, parent_name)
nodeSet = set()
keywordSet = set()
stop = 0
offset = -20
level = 0
maxLevel = 7
counter = 0
base = ""

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "zhihu", charset="utf8")
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should really be added here
        html = response.read()
        return html
    except:
        pass
    return None

def getTopics():
    url = "https://www.zhihu.com/topics"
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should really be added here
        html = response.read().decode("utf-8")
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all("li", {"class": "zm-topic-cat-item"})
        for li in lis:
            data_id = li.get("data-id")
            name = li.text
            curr.execute("select id from classify_new where name=%s", (name,))
            y = curr.fetchone()
            if not y:
                curr.execute("INSERT INTO classify_new(data_id,name)VALUES(%s,%s)", (data_id, name))
                conn.commit()
    except Exception as e:
        print "get topic error", e

def get_extension(name):
    where = name.rfind(".")
    if where != -1:
        return name[where:len(name)]
    return None

def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when = time.strftime("%Y-%m-%d", time.localtime(time.time()))
    foldername = str(when)
    return foldername

def makeDateFolder(par, classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + "//" + GetDateString() + "//" + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + "/" + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "kk", e
        return None

def download_img(url, classify):
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "//" + name
        try:
            if "e82bab09c_m" in str(url):  # Zhihu's default avatar, no need to save it
                return True
            if not os.path.exists(filename):
                file_object = open(filename, "w+b")
                file_object.write(dataimg)
                file_object.close()
                return "/room/default/" + GetDateString() + "/" + str(classify) + "/" + name
            else:
                print "file exist"
            return None
        except IOError as e1:
            print "e1=", e1
    except Exception as e:
        print "eee", e
    return None  # if the download failed, fall back to the original site's link

def getChildren(node, name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch = "父话题"  # "parent topic"
        node_name = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
        topic_cla = soup.find("div", {"class": "child-topic"})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all("a", {"class": "zm-item-tag"})  # fetch all child nodes
                if u"子话题" in p_ch:  # "child topics"
                    for a in aList:
                        token = a.get("data-token")
                        a = str(a)
                        start = a.find(">")
                        end = a.rfind("</")
                        new_node = a[start + 1:end].strip()  # strip the tag markup, keep the topic name
                        curr.execute("select id from rooms where name=%s", (new_node,))  # make sure the name is unique first
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e

def getContent(n, name, p, top_id):
    try:
        global counter
        curr.execute("select id from rooms where name=%s", (name,))  # make sure the name is unique first
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
            pic_path = soup.find("a", {"id": "zh-avartar-edit-form"}).find("img").get("src")
            description = soup.find("div", {"class": "zm-editable-content"})
            if description is not None:
                description = description.text
            if u"未归类" in title or u"根话题" in title:  # "uncategorized"/"root topic": still insert, but blank the description to avoid dead loops
                description = None
            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if tag_path is not None:
                if tag_path == True:  # default avatar: store a NULL path instead
                    tag_path = None
                father_id = 2  # default parent: 杂谈 (misc)
                curr.execute("select id from rooms where name=%s", (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute("select id from rooms where name=%s", (name,))  # make sure the name is unique first
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    temp = time.time()
                    x = time.localtime(float(temp))
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", x)  # current time
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    # content qualified for insertion
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # must commit immediately, otherwise children can't find their parent
                    if counter % 200 == 0:
                        print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e

def work():
    global queue
    curr.execute("select id,node,parent,name from classify where status=1")
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        node = r[1]
        parent = r[2]
        name = r[3]
        try:
            queue.put((node, name, parent))  # seed the queue first
            while queue.qsize() > 0:
                n, name, p = queue.get()  # dequeue the head node
                getContent(n, name, p, top_id)
                getChildren(n, name)  # enqueue the children of the dequeued node
            conn.commit()
        except Exception as e:
            print "what's wrong", e

def new_work():
    global queue
    curr.execute("select id,data_id,name from classify_new_copy where status=1")
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except:
            pass

def get_topis(data_id, name, top_id):
    global queue
    url = "https://www.zhihu.com/node/TopicsPlazzaListV2"
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {"method": "next", "params": '{"topic_id":' + data_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode("utf-8")
                json_str = json.loads(html)
                ms = json_str["msg"]
                if len(ms) < 5:
                    break
                msg = ms[0]
            except Exception as e:
                print "eeeee", e
            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all("div", {"class": "blk"})
                for blk in blks:
                    page = blk.find("a").get("href")
                    if page is not None:
                        node = page.replace("/topic/", "")  # push more seeds into the queue
                        parent = name
                        ne = blk.find("strong").text
                        try:
                            queue.put((node, ne, parent))  # seed the queue first
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # dequeue the head node
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the children of the dequeued node
                            conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError as e:
            print "error is", e

if __name__ == "__main__":
    i = 0
    while i < 400:
        new_work()
        i = i + 1
A word about the database: I won't attach a dump here. Build the tables yourself from the fields used in the code — it really is that simple. I use MySQL; adapt the schema to your own needs.
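For reference, here is one possible way to create those tables, inferred from the SELECT and INSERT statements in the script. The column names come from the code; every type and length is my own assumption, so adjust freely. (The legacy classify table used by work() is omitted.)

# One possible schema, inferred from the SELECT/INSERT statements in the
# crawler. Column names come from the code; all types/lengths are assumptions.
import MySQLdb as mdb

DDL = [
    """CREATE TABLE IF NOT EXISTS classify_new (
        id INT AUTO_INCREMENT PRIMARY KEY,
        data_id VARCHAR(32),
        name VARCHAR(128)
    )""",
    # new_work() reads from a copy of classify_new that adds a status flag
    """CREATE TABLE IF NOT EXISTS classify_new_copy (
        id INT AUTO_INCREMENT PRIMARY KEY,
        data_id VARCHAR(32),
        name VARCHAR(128),
        status TINYINT DEFAULT 1
    )""",
    """CREATE TABLE IF NOT EXISTS rooms (
        id INT AUTO_INCREMENT PRIMARY KEY,
        father_id INT,
        name VARCHAR(128),
        friends_num INT,
        description TEXT,
        create_time DATETIME,
        creater_id INT,
        room_avatar VARCHAR(255),
        is_pass TINYINT,
        has_index TINYINT,
        reason_id INT
    )""",
]

conn = mdb.connect("127.0.0.1", "root", "root", "zhihu", charset="utf8")
curr = conn.cursor()
for stmt in DDL:
    curr.execute(stmt)
conn.commit()
conn.close()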
Again, if anything is unclear, come find me on quzhuanpan (去转盘网) — I built that site too, and the QQ group number there is kept up to date. I'm not leaving a QQ number here, to keep it from being blocked by the system.
The copyright of this article belongs to the author. Please do not reproduce it without permission; if this article violates any rules, you may contact the administrator to have it removed.
When reposting, please cite the original URL: https://www.ucloud.cn/yun/41199.html