摘要:之前打算做个微信小程序的社区,所以写了爬虫去爬取微信小程序,后面发现做微信小程序没有前途,就把原来的项目废弃了做了现在的网站观点,不过代码放着也是放着,还不如公开让大家用,所以我把代码贴出来,有需要的复制了使用就是了。
之前打算做个微信小程序的社区,所以写了爬虫去爬取微信小程序,后面发现做微信小程序没有前途,就把原来的项目废弃了做了现在的网站观点,不过代码放着也是放着,还不如公开让大家用,所以我把代码贴出来,有需要的复制了使用就是了。
#coding:utf-8 __author__ = "haoning" #!/usr/bin/env python import time import urllib2 import datetime import requests import json import random import sys import platform import uuid reload(sys) sys.setdefaultencoding( "utf-8" ) import re import os import MySQLdb as mdb from bs4 import BeautifulSoup DB_HOST = "127.0.0.1" DB_USER = "root" DB_PASS = "root" #init database conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "pybbs-springboot", charset="utf8") conn.autocommit(False) curr = conn.cursor() count=0 how_many=0 base_url="http://www.wechat-cloud.com" url=base_url+"/index.php?s=/home/article/ajax_get_list.html&category_id={category_id}&page={page}&size={size}" user_agents = [ "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11", "Opera/9.25 (Windows NT 5.1; U; en)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12", "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9", "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ", ] def fake_header(): agent=random.choice(user_agents) cookie="PHPSESSID=p5mokvec7ct1gqe9efcnth9d44; Hm_lvt_c364957e96174b029f292041f7d822b7=1487492811,1487556626; Hm_lpvt_c364957e96174b029f292041f7d822b7=1487564069" req_header = { "Accept":"application/json, text/javascript, */*; q=0.01", #"Accept-Encoding":"gzip, deflate, sdch", "Accept-Language":"zh-CN,zh;q=0.8", "Cache-Control":"max-age=0", "Connection":"keep-alive", "Host":"www.wechat-cloud.com", #"Cookie":cookie, "Referer":"http://www.wechat-cloud.com/index.php?s=/home/index/index.html", "Upgrade-Insecure-Requests":"1", "User-Agent":agent, 
"X-Requested-With":"XMLHttpRequest", } return req_header def gethtml(url): try: header=fake_header() req = urllib2.Request(url,headers=header) response = urllib2.urlopen(req, None,15) html = response.read() return html except Exception as e: print "e",e return None def get_img_data(url): try: #添加头信息,模仿浏览器抓取网页,对付返回403禁止访问的问题 req = urllib2.Request(url) response = urllib2.urlopen(req, None,15) dataimg = response.read() return dataimg except Exception as e: print "image data",e return None def makeDateFolder(par,classify): try: if os.path.isdir(par): newFolderName=par + "//" + str(classify)+ "//" +GetDateString() if not os.path.isdir( newFolderName ): os.makedirs( newFolderName ) return newFolderName else: return par except Exception,e: print "kk",e return par def map_folder(what): return what def GetDateString(): when=time.strftime("%Y-%m-%d",time.localtime(time.time())) foldername = str(when) return foldername def get_extension(name): where=name.rfind(".") if where!=-1: return name[where:len(name)] return "#" def download_img(url,what): try: #print url extention=get_extension(url) dataimg=get_img_data(url) name=str(uuid.uuid1()).replace("-","")+"-www.weixinapphome.com" #print "name",name classfiy_folder=map_folder(what) top="E://wxapp_store" filename =makeDateFolder(top,classfiy_folder)+"//"+name+extention try: if not os.path.exists(filename): file_object = open(filename,"w+b") file_object.write(dataimg) file_object.close() return classfiy_folder+"/"+GetDateString()+"/"+name+extention else: print "file exist" return None except IOError,e1: print "e1=",e1 #pass return None #如果没有下载下来就利用原来网站的链接 except Exception,e: print "problem",e pass return None def work(): page=0 global how_many while 1: try: page=page+1 begin_url=url.format(category_id=0, page=page,size=12).encode("utf-8") html=gethtml(begin_url) if html is not None: #print html json_results=json.loads(html) is_end=json_results["isEnd"] if str(is_end)=="True": break results=json_results["list"] for result in 
results: href=result["href"] detail_url=base_url+href #print detail_url detail_html=gethtml(detail_url) if detail_html is not None: soup = BeautifulSoup(detail_html) icon_url=base_url+soup.find("div",{"class":"icon fl"}).find("img").get("src") name=soup.find("div",{"class":"cont fl"}).find("h2").text classify=soup.find("div",{"class":"tab"}).find("span").text classify=str(classify).replace("分类: ","") #print classify barcode_path=base_url+soup.find("div",{"id":"install-code"}).find("img").get("src") view_num=soup.find("span",{"class":"views"}).text #view_num=filter(str.isalnum,str(view_num)) pic_path=base_url+soup.find("div",{"class":"img-box"}).find("img").get("src") temp = time.time() x = time.localtime(float(temp)) acq_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now curr.execute("select id from pybbs_wxapp_store where `from`=%s",(detail_url)) y= curr.fetchone() if not y: y1=download_img(icon_url,"icon") y2=download_img(barcode_path,"barcode") y3=download_img(pic_path,"pic") if (y1 is not None) and (y2 is not None) and (y3 is not None): name=name author=None classify=classify describe=None view_num=view_num #print view_num logo=y1 _from=detail_url barcode=y2 acq_time=acq_time hot_weight=-9999 pic_uuid=str(uuid.uuid1()).replace("-","") pic_path=y3 #print name,author,classify,describe,view_num,logo,_from,barcode,acq_time,hot_weight,pic_uuid curr.execute("INSERT INTO pybbs_wxapp_store(name,author,classify,`describe`,view_num,logo,`from`,barcode,acq_time,hot_weight,pic_path)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(name,author,classify,describe,view_num,logo,_from,barcode,acq_time,hot_weight,pic_path)) curr.execute("select id from pybbs_wxapp_classify where `classify_name`=%s",(classify)) yx= curr.fetchone() if not yx: describe=None temp = time.time() x = time.localtime(float(temp)) record_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now curr.execute("INSERT INTO 
pybbs_wxapp_classify(classify_name,`describe`,record_time)VALUES(%s,%s,%s)",(classify,describe,record_time)) how_many+=1 print "new comer:",pic_uuid,">>",how_many if how_many % 10==0: conn.commit() conn.commit() except Exception as e: print "while error",e if __name__ == "__main__": i=3 while i>0: work() i=i-1
其中有些参数请改成自己的,比如说数据库密码了,图片存储到哪个盘,数据库表格自己建立,因为这些实在太简单了,所以没啥可以唠叨的。
文章版权归作者所有,未经允许请勿转载,若此文章存在违规行为,您可以联系管理员删除。
转载请注明本文地址:https://www.ucloud.cn/yun/41197.html
摘要:时间永远都过得那么快,一晃从年注册,到现在已经过去了年那些被我藏在收藏夹吃灰的文章,已经太多了,是时候把他们整理一下了。那是因为收藏夹太乱,橡皮擦给设置私密了,不收拾不好看呀。 ...
摘要:爬虫目标是获取用户的微博数、关注数、粉丝数。创建数据这部分我只需要个人信息,微博数,关注数、粉丝数这些基本信息就行。 前言 Scrapy学习(三) 爬取豆瓣图书信息 接上篇之后。这次来爬取需要登录才能访问的微博。爬虫目标是获取用户的微博数、关注数、粉丝数。为建立用户关系图(尚未实现)做数据储备 准备 安装第三方库requests和pymongo 安装MongoDB 创建一个weibo爬虫项...
摘要:本人长期出售超大量微博数据旅游网站评论数据,并提供各种指定数据爬取服务,。如果用户传入伪造的,则新浪微博会返回一个错误。 PS:(本人长期出售超大量微博数据、旅游网站评论数据,并提供各种指定数据爬取服务,Message to YuboonaZhang@Yahoo.com。由于微博接口更新后限制增大,这个代码已经不能用来爬数据了。如果只是为了收集数据可以咨询我的邮箱,如果是为了学习爬虫,...
摘要:本人长期出售超大量微博数据旅游网站评论数据,并提供各种指定数据爬取服务,。如果用户传入伪造的,则新浪微博会返回一个错误。 PS:(本人长期出售超大量微博数据、旅游网站评论数据,并提供各种指定数据爬取服务,Message to YuboonaZhang@Yahoo.com。由于微博接口更新后限制增大,这个代码已经不能用来爬数据了。如果只是为了收集数据可以咨询我的邮箱,如果是为了学习爬虫,...
阅读 4162·2021-11-22 13:52
阅读 2080·2021-09-22 15:12
阅读 1123·2019-08-30 15:53
阅读 3456·2019-08-29 17:12
阅读 2193·2019-08-29 16:23
阅读 1650·2019-08-26 13:56
阅读 1772·2019-08-26 13:44
阅读 1885·2019-08-26 11:56