摘要:上一篇写了个有意思的文章脚本撞库国内某榴账号很多朋友反映,该榴账号有验证,即时撞破账号也无卵用,其实新手号还是可以使用的,至于撞库破解某榴账号的问题请移到上篇帖子查看。华丽分割线这次再来研究下如何搞定某逼乎的话题问题。
上一篇写了个有意思的文章:python 脚本撞库国内“某榴”账号 https://www.52pojie.cn/thread...
很多朋友反映,该榴账号有google验证,即使撞破账号也无卵用,其实新手号还是可以使用的,至于撞库破解“某榴”账号的问题请移到上篇帖子查看。
这次再来研究下如何搞定“某逼乎”的话题问题。
逼乎现在在整个社区类网站中可以说火的不要不要的,逼乎上的内容质量在所有社区中还是相对较高的,很多时候我们都需要爬取逼乎精彩的话题,当然这不是为了装,搞不好你的设计恰好
就需要这么一个需求。
程序猿之间上代码,一起研究下:
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # true division

from __builtin__ import False
from Queue import Queue
import json
import os
import platform
import re
import sys
import time
import urllib
import urllib2
import uuid

import MySQLdb as mdb
from bs4 import BeautifulSoup
# Python 2 only: reload ``sys`` and switch the interpreter-wide default codec
# to UTF-8, so implicit str/unicode conversions of the Chinese text handled
# below use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding( "utf-8" )
# HTTP headers sent with the topic-plaza POST request in get_topis().
# The Cookie value embeds double quotes (cap_id="..."), so the literal is
# single-quoted; with double quotes on the outside the statement is a syntax
# error.
# NOTE(review): the cookie is a captured, hard-coded session value and is
# presumably stale -- consider loading it from configuration.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    "Referer": "https://www.zhihu.com/topics",
    "Cookie": '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a',
}
# MySQL connection settings (local server, hard-coded credentials).
DB_HOST = "127.0.0.1"
DB_USER = "root"
DB_PASS = "root"
# Crawl frontier: getChildren() and get_topis() put 3-tuples of
# (topic token, topic name, parent topic name) here.
queue= Queue() # receive queue
# nodeSet is only named in a ``global`` statement in getChildren(); keywordSet,
# stop, offset, level, maxLevel and base are not read or written by any
# function visible in this listing (get_topis() uses its own local ``offset``).
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
# Number of rooms inserted so far; incremented by getContent().
counter=0
base=""
# Shared connection to the "zhihu" database. Autocommit is switched off, so
# the functions below call conn.commit() explicitly after writing.
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "zhihu", charset="utf8")
conn.autocommit(False)
# Single cursor shared by every function in this module.
curr = conn.cursor()
def get_html(url):
    """Fetch ``url`` and return the raw response body, or ``None`` on failure.

    The request uses a 3 second timeout. Errors while opening or reading the
    URL are reported on stdout and turned into ``None`` so that callers can
    simply skip the page. ``KeyboardInterrupt``/``SystemExit`` are no longer
    swallowed (the original used a bare ``except``).
    """
    try:
        # A proxy should be plugged in here (note carried over from the
        # original author).
        response = urllib2.urlopen(urllib2.Request(url), None, 3)
        try:
            return response.read()
        finally:
            # Always release the socket, even if read() fails.
            response.close()
    except Exception as e:
        print("get_html error %s %s" % (url, e))
    return None
def getTopics():
    """Store the top-level topic categories from the Zhihu topics page.

    Downloads https://www.zhihu.com/topics, reads every
    ``<li class="zm-topic-cat-item">`` element and inserts its ``data-id`` and
    text into ``classify_new`` unless a row with the same name already exists.
    Any failure is reported on stdout; nothing is returned.
    """
    url = "https://www.zhihu.com/topics"
    print(url)
    try:
        # A proxy should be plugged in here (note carried over from the
        # original author). A timeout is set so a stalled server cannot hang
        # the crawler forever.
        response = urllib2.urlopen(urllib2.Request(url), None, 5)
        try:
            html = response.read().decode("utf-8")
        finally:
            response.close()
        print(html)
        soup = BeautifulSoup(html)
        for li in soup.find_all("li", {"class": "zm-topic-cat-item"}):
            data_id = li.get("data-id")
            name = li.text
            # Query parameters must be a sequence: ``(name)`` is just a
            # string, ``(name,)`` is the one-element tuple the driver expects.
            curr.execute("select id from classify_new where name=%s", (name,))
            if not curr.fetchone():
                curr.execute(
                    "INSERT INTO classify_new(data_id,name)VALUES(%s,%s)",
                    (data_id, name),
                )
                conn.commit()
    except Exception as e:
        print("get topic error %s" % (e,))
def get_extension(name):
    """Return the extension of ``name`` including the leading dot, or ``None``.

    The extension is everything from the last ``"."`` onwards. If that tail
    contains a path separator, the dot belongs to a directory or host name
    (e.g. ``"https://pic1.zhimg.com/abc"``) rather than to the file, so
    ``None`` is returned instead of a bogus extension.
    """
    dot = name.rfind(".")
    if dot == -1:
        return None
    extension = name[dot:]
    if "/" in extension or "\\" in extension:
        return None
    return extension
def which_platform():
    """Return the operating system name reported by ``platform.system()``."""
    return platform.system()
def GetDateString():
    """Return today's local date formatted as ``YYYY-MM-DD``."""
    # strftime() without a time tuple formats the current local time, which
    # is what the original localtime(time()) round trip produced.
    return time.strftime("%Y-%m-%d")
def makeDateFolder(par, classify):
    """Create (if needed) and return ``<par>/<today>/<classify>``.

    Args:
        par: existing parent directory.
        classify: value used as the name of the innermost folder.

    Returns:
        The folder path as a string, or ``None`` when ``par`` is not a
        directory or the folder cannot be created. The path is joined with
        ``"/"`` on Linux and ``"//"`` elsewhere, exactly as before.
    """
    try:
        if not os.path.isdir(par):
            return None
        separator = "/" if which_platform() == "Linux" else "//"
        folder = par + separator + GetDateString() + separator + str(classify)
        if not os.path.isdir(folder):
            os.makedirs(folder)
        return folder
    except Exception as e:
        print("kk %s" % (e,))
        return None
def download_img(url, classify, top="E://topic_pic"):
    """Download a topic avatar and store it under a dated folder.

    Args:
        url: image URL.
        classify: value used as the name of the innermost storage folder and
            as part of the returned path.
        top: root folder for stored pictures (defaults to the original
            hard-coded location).

    Returns:
        * ``True`` when ``url`` contains the marker ``"e82bab09c_m"``; the
          caller stores such rooms without an avatar, so nothing is
          downloaded.
        * The site-relative path ``/room/default/<date>/<classify>/<file>``
          when the image was written.
        * ``None`` when the URL has no extension, the download fails, the
          target folder is unavailable or the file already exists.
    """
    try:
        extension = get_extension(url)
        if extension is None:
            return None
        # Checked before downloading: the image data was discarded for this
        # URL anyway, so fetching it first only wasted a request.
        if "e82bab09c_m" in str(url):
            return True

        response = urllib2.urlopen(urllib2.Request(url), None, 3)
        try:
            image_bytes = response.read()
        finally:
            response.close()

        name = uuid.uuid1().hex + "_www.guandn.com" + extension
        folder = makeDateFolder(top, classify)
        if folder is None:
            # Previously this fell through to os.path.exists(None) and
            # surfaced as an opaque TypeError.
            print("eee image folder unavailable: %s" % (top,))
            return None

        filename = folder + "//" + name
        if os.path.exists(filename):
            print("file exist")
            return None
        try:
            # ``with`` closes the file even if the write fails.
            with open(filename, "wb") as image_file:
                image_file.write(image_bytes)
        except IOError as e1:
            print("e1= %s" % (e1,))
            return None
        return "/room/default/" + GetDateString() + "/" + str(classify) + "/" + name
    except Exception as e:
        print("eee %s" % (e,))
    # Nothing was stored; the caller decides whether to fall back to the
    # original remote link.
    return None
def getChildren(node, name):
    """Queue the child topics listed on a topic's "hot" page.

    Fetches ``https://www.zhihu.com/topic/<node>/hot``. If the page has a
    ``div.child-topic`` section whose text mentions child topics, every
    ``a.zm-item-tag`` anchor whose text is not yet a ``rooms.name`` is put on
    the module queue as ``(data-token, anchor text, this page's title)``.

    Args:
        node: topic token used to build the URL.
        name: unused; kept for interface compatibility.
    """
    try:
        html = get_html("https://www.zhihu.com/topic/" + str(node) + "/hot")
        if html is None:
            return
        soup = BeautifulSoup(html)
        node_name = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
        topic_cla = soup.find("div", {"class": "child-topic"})
        if topic_cla is None or u"子话题" not in topic_cla.text:
            return
        try:
            # Walk every child-topic anchor.
            for anchor in soup.find_all("a", {"class": "zm-item-tag"}):
                token = anchor.get("data-token")
                # NOTE(review): the published listing had lost the escape
                # sequences and the closing-tag literal here (three identical
                # replace(" ", "") calls and rfind("")). Reconstructed as
                # "strip newlines, tabs and spaces, then take the text between
                # the opening tag and </a>" -- confirm against the original.
                markup = str(anchor).replace("\n", "").replace("\t", "").replace(" ", "")
                start = markup.find(">")
                end = markup.rfind("</a>")
                if end == -1:
                    end = len(markup)
                new_node = markup[start + 1:end]
                # Names must be unique: skip topics that are already stored.
                # Parameters are passed as a tuple, not a bare string.
                curr.execute("select id from rooms where name=%s", (new_node,))
                y = curr.fetchone()
                if not y:
                    print("y= %s new_node= %s token= %s" % (y, new_node, token))
                    queue.put((token, new_node, node_name))
        except Exception as e:
            print("add queue error %s" % (e,))
    except Exception as e:
        print("get html error %s" % (e,))
def getContent(n, name, p, top_id):
    """Scrape one topic page and insert it into ``rooms`` if it is new.

    Args:
        n: topic token used to build ``https://www.zhihu.com/topic/<n>/hot``.
        name: topic name used for the first "already stored?" check.
        p: parent topic name; its ``rooms.id`` becomes ``father_id``
            (2 when no such row exists).
        top_id: passed to ``download_img`` as the storage folder name.

    The row is only inserted when ``download_img`` returns something other
    than ``None``. All errors are reported on stdout; nothing is returned.
    """
    global counter
    try:
        # Names must be unique. Parameters are passed as a tuple, not a bare
        # string.
        curr.execute("select id from rooms where name=%s", (name,))
        existing = curr.fetchone()
        print("exist??  %s n= %s" % (existing, n))
        if existing:
            return

        html = get_html("https://www.zhihu.com/topic/" + str(n) + "/hot")
        if html is None:
            return
        soup = BeautifulSoup(html)
        title = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
        pic_path = soup.find("a", {"id": "zh-avartar-edit-form"}).find("img").get("src")
        description_tag = soup.find("div", {"class": "zm-editable-content"})
        description = None if description_tag is None else description_tag.text
        if u"未归类" in title or u"根话题" in title:
            # Still stored, but without a description, to avoid an endless
            # loop (original author's note).
            description = None

        tag_path = download_img(pic_path, top_id)
        print("tag_path= %s" % (tag_path,))
        if tag_path is None:
            return
        # True means "default avatar": store the room without a picture.
        room_avatar = None if tag_path is True else tag_path

        father_id = 2  # defaults to the "miscellaneous" room
        curr.execute("select id from rooms where name=%s", (p,))
        for row in curr.fetchall():
            father_id = row[0]

        # Check again with the title scraped from the page itself.
        curr.execute("select id from rooms where name=%s", (title,))
        stored = curr.fetchone()
        print("store see.. %s" % (stored,))
        if stored:
            return

        create_time = time.strftime("%Y-%m-%d %H:%M:%S")
        curr.execute(
            "INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (father_id, title, 0, description, create_time, None, room_avatar, 1, 0, None),
        )
        # Commit immediately, otherwise children cannot find their parent row.
        conn.commit()
        # Count only rows that were really stored.
        counter += 1
        if counter % 200 == 0:
            print("current node %s num %s" % (title, counter))
    except Exception as e:
        print("get content error %s" % (e,))
def work():
    """Crawl every active seed topic from the ``classify`` table.

    For each row with ``status=1`` the seed is queued as
    ``(node, name, parent)`` and the queue is drained: each dequeued topic is
    stored with ``getContent`` and expanded with ``getChildren``.

    The original unpacked the queued 3-tuples into two names and called
    ``getContent`` with three of its four arguments, so every seed failed
    immediately; the loop now matches the one in ``get_topis``.
    """
    curr.execute("select id,node,parent,name from classify where status=1")
    for top_id, node, parent, name in curr.fetchall():
        try:
            queue.put((node, name, parent))  # seed the queue
            while queue.qsize() > 0:
                n, topic_name, p = queue.get()  # pop the front topic
                getContent(n, topic_name, p, top_id)
                getChildren(n, topic_name)  # queue this topic's children
            conn.commit()
        except Exception as e:
            print("what's wrong %s" % (e,))
def new_work():
    """Run ``get_topis`` for every active category in ``classify_new_copy``.

    A failure in one category is reported and the next category is processed.
    ``KeyboardInterrupt``/``SystemExit`` are no longer swallowed, so the
    crawler can be stopped with Ctrl-C (the original used a bare ``except``).
    """
    curr.execute("select id,data_id,name from classify_new_copy where status=1")
    for top_id, data_id, name in curr.fetchall():
        try:
            get_topis(data_id, name, top_id)
        except Exception as e:
            print("new_work error %s %s" % (name, e))
def _fetch_topic_fragment(url, data_id, offset):
    """POST one topic-plaza page request and return its first HTML fragment.

    Returns ``None`` when the request or decoding fails, or when the ``msg``
    list of the JSON reply holds fewer than five entries (the stop condition
    used by the original loop).
    """
    # The payload is written out by hand so it stays byte-identical to the
    # original request body.
    params = '{"topic_id":%s,"offset":%s,"hash_id":""}' % (data_id, offset)
    data = urllib.urlencode({"method": "next", "params": params})
    try:
        response = urllib2.urlopen(urllib2.Request(url, data, headers), None, 5)
        try:
            reply = json.loads(response.read().decode("utf-8"))
        finally:
            response.close()
        fragments = reply["msg"]
    except Exception as e:
        print("eeeee %s" % (e,))
        return None
    if len(fragments) < 5:
        return None
    # NOTE(review): only the first element of ``msg`` is used, as in the
    # original; confirm whether the remaining fragments should be parsed too.
    return fragments[0]


def get_topis(data_id, name, top_id):
    """Crawl the topics listed under one topic-plaza category.

    Pages through ``TopicsPlazzaListV2`` in steps of 20. For every
    ``div.blk`` in the returned fragment, the linked topic is queued with
    ``name`` as its parent and the queue is drained through ``getContent`` /
    ``getChildren``.

    Args:
        data_id: category id sent as ``topic_id``.
        name: category name, recorded as the parent of each seed topic.
        top_id: forwarded to ``getContent``.

    Paging stops when a page is short or the request fails. The original kept
    requesting ever larger offsets forever after a failure.
    """
    url = "https://www.zhihu.com/node/TopicsPlazzaListV2"
    data_id = str(data_id)
    offset = 0
    while True:
        fragment = _fetch_topic_fragment(url, data_id, offset)
        if fragment is None:
            break
        offset += 20

        soup = BeautifulSoup(str(fragment))
        for blk in soup.find_all("div", {"class": "blk"}):
            link = blk.find("a")
            page = link.get("href") if link is not None else None
            if page is None:
                continue
            node = page.replace("/topic/", "")  # store more seeds
            try:
                seed_name = blk.find("strong").text
                # ``name`` stays the category name for every seed: the
                # dequeue below uses its own local names, where the original
                # overwrote the ``name`` parameter.
                queue.put((node, seed_name, name))
                while queue.qsize() > 0:
                    n, topic_name, p = queue.get()  # pop the front topic
                    size = queue.qsize()
                    if size > 0:
                        print(size)
                    getContent(n, topic_name, p, top_id)
                    getChildren(n, topic_name)  # queue this topic's children
                conn.commit()
            except Exception as e:
                print("what's wrong %s" % (e,))
if __name__ == "__main__":
    # Run the whole crawl 400 times; topics that are already stored are
    # skipped by getContent(), so later passes only pick up what is missing.
    for _ in range(400):
        new_work()
当然代码是十分简单的,稍微有python基础都可以搞定,注释清楚明白,大家安静讨论研究下,献丑了。
文章版权归作者所有,未经允许请勿转载,若此文章存在违规行为,您可以联系管理员删除。
转载请注明本文地址:https://www.ucloud.cn/yun/42793.html
摘要:这里只讨论技术本身,代码中某榴的地址也已经改掉,避免被管理员误解禁言等发生,谢谢大家理解。 其实日常生活中我们的用户名和密码就那么几个,所以这给撞库带来了可能,本文主要给出python脚本撞库的一点粗浅代码。这里只讨论技术本身,代码中某榴的地址也已经改掉,避免被管理员误解禁言等发生,谢谢大家理解。 代码如下: import sysreload(sys) sys.setdefaulten...
摘要:既然这不是宗教,而是关于如何面对新的事物,我认为我们应该列出所有其他人认为不使用来做开发的理由。在下工作的不好这是一定的。流行度只是衡量使用率,社区活跃度的一个指标,用来帮助人们判断技术的可用性,稳定性和支持程度。不幸的是,人们混淆了和。这是一篇赞美 Ruby 的文章!!!看完再喷不迟 请注意:这是一篇主观意识的文章。它的目的并不是要说服你使用或者不使用Ruby,或者其他任何技术。这...
摘要:用户社区的使用用户社区为实名制社区,在提问回复或发文评论前必须绑定手机号才能够正常发帖。官方有权对灌水违法违规不文明内容进行删除。亲爱的小伙伴你好!首先感谢你来到UCloud用户社区,期待你的加入!UClub用户社区旨在为UCloud用户及广大云计算爱好者提供一个开放的学习交流平台。为了你能更好地使用UCloud用户社区,请你花费3分钟仔细阅读,阅读完成后将获得10积分。UCloud用户社区的...
摘要:而在年的新版本中,对新建项目的配置增加了一点小功能。点击就是新建一个项目。在创建同时,还需要指定项目所使用的环境。但对于新手来说,就会发生,在命令行里通过安装的库,无法在自己创建的项目中使用。 showImg(https://segmentfault.com/img/remote/1460000017038452?w=600&h=338); https://www.zhihu.com...
摘要:去吧,参加一个在上正在举办的实时比赛吧试试你所学到的全部知识微软雅黑深度学习终于看到这个,兴奋吧现在,你已经学到了绝大多数关于机器学习的技术,是时候试试深度学习了。微软雅黑对于深度学习,我也是个新手,就请把这些建议当作参考吧。 如果你想做一个数据科学家,或者作为一个数据科学家你想扩展自己的工具和知识库,那么,你来对地方了。这篇文章的目的,是给刚开始使用Python进行数据分析的人,指明一条全...
阅读 2294·2021-09-22 15:27
阅读 3165·2021-09-03 10:32
阅读 3490·2021-09-01 11:38
阅读 2492·2019-08-30 15:56
阅读 2206·2019-08-30 13:01
阅读 1531·2019-08-29 12:13
阅读 1409·2019-08-26 13:33
阅读 884·2019-08-26 13:30