摘要:之前打算做个微信小程序的社区,所以写了爬虫去爬取微信小程序,后面发现做微信小程序没有前途,就把原来的项目废弃了做了现在的网站观点,不过代码放着也是放着,还不如公开让大家用,所以我把代码贴出来,有需要的复制了使用就是了。
之前打算做个微信小程序的社区,所以写了爬虫去爬取微信小程序,后面发现做微信小程序没有前途,就把原来的项目废弃了做了现在的网站观点,不过代码放着也是放着,还不如公开让大家用,所以我把代码贴出来,有需要的复制了使用就是了。
#coding:utf-8
__author__ = "haoning"
#!/usr/bin/env python
import time
import urllib2
import datetime
import requests
import json
import random
import sys
import platform
import uuid
reload(sys)
sys.setdefaultencoding( "utf-8" )
import re
import os
import MySQLdb as mdb
from bs4 import BeautifulSoup
DB_HOST = "127.0.0.1"
DB_USER = "root"
DB_PASS = "root"
#init database
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "pybbs-springboot", charset="utf8")
conn.autocommit(False)
curr = conn.cursor()
count=0
how_many=0
base_url="http://www.wechat-cloud.com"
url=base_url+"/index.php?s=/home/article/ajax_get_list.html&category_id={category_id}&page={page}&size={size}"
user_agents = [
"Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11",
"Opera/9.25 (Windows NT 5.1; U; en)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12",
"Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
]
def fake_header():
agent=random.choice(user_agents)
cookie="PHPSESSID=p5mokvec7ct1gqe9efcnth9d44; Hm_lvt_c364957e96174b029f292041f7d822b7=1487492811,1487556626; Hm_lpvt_c364957e96174b029f292041f7d822b7=1487564069"
req_header = {
"Accept":"application/json, text/javascript, */*; q=0.01",
#"Accept-Encoding":"gzip, deflate, sdch",
"Accept-Language":"zh-CN,zh;q=0.8",
"Cache-Control":"max-age=0",
"Connection":"keep-alive",
"Host":"www.wechat-cloud.com",
#"Cookie":cookie,
"Referer":"http://www.wechat-cloud.com/index.php?s=/home/index/index.html",
"Upgrade-Insecure-Requests":"1",
"User-Agent":agent,
"X-Requested-With":"XMLHttpRequest",
}
return req_header
def gethtml(url):
try:
header=fake_header()
req = urllib2.Request(url,headers=header)
response = urllib2.urlopen(req, None,15)
html = response.read()
return html
except Exception as e:
print "e",e
return None
def get_img_data(url):
try:
#添加头信息,模仿浏览器抓取网页,对付返回403禁止访问的问题
req = urllib2.Request(url)
response = urllib2.urlopen(req, None,15)
dataimg = response.read()
return dataimg
except Exception as e:
print "image data",e
return None
def makeDateFolder(par,classify):
try:
if os.path.isdir(par):
newFolderName=par + "//" + str(classify)+ "//" +GetDateString()
if not os.path.isdir( newFolderName ):
os.makedirs( newFolderName )
return newFolderName
else:
return par
except Exception,e:
print "kk",e
return par
def map_folder(what):
return what
def GetDateString():
when=time.strftime("%Y-%m-%d",time.localtime(time.time()))
foldername = str(when)
return foldername
def get_extension(name):
where=name.rfind(".")
if where!=-1:
return name[where:len(name)]
return "#"
def download_img(url,what):
try:
#print url
extention=get_extension(url)
dataimg=get_img_data(url)
name=str(uuid.uuid1()).replace("-","")+"-www.weixinapphome.com"
#print "name",name
classfiy_folder=map_folder(what)
top="E://wxapp_store"
filename =makeDateFolder(top,classfiy_folder)+"//"+name+extention
try:
if not os.path.exists(filename):
file_object = open(filename,"w+b")
file_object.write(dataimg)
file_object.close()
return classfiy_folder+"/"+GetDateString()+"/"+name+extention
else:
print "file exist"
return None
except IOError,e1:
print "e1=",e1
#pass
return None #如果没有下载下来就利用原来网站的链接
except Exception,e:
print "problem",e
pass
return None
def work():
page=0
global how_many
while 1:
try:
page=page+1
begin_url=url.format(category_id=0, page=page,size=12).encode("utf-8")
html=gethtml(begin_url)
if html is not None:
#print html
json_results=json.loads(html)
is_end=json_results["isEnd"]
if str(is_end)=="True":
break
results=json_results["list"]
for result in results:
href=result["href"]
detail_url=base_url+href
#print detail_url
detail_html=gethtml(detail_url)
if detail_html is not None:
soup = BeautifulSoup(detail_html)
icon_url=base_url+soup.find("div",{"class":"icon fl"}).find("img").get("src")
name=soup.find("div",{"class":"cont fl"}).find("h2").text
classify=soup.find("div",{"class":"tab"}).find("span").text
classify=str(classify).replace("分类: ","")
#print classify
barcode_path=base_url+soup.find("div",{"id":"install-code"}).find("img").get("src")
view_num=soup.find("span",{"class":"views"}).text
#view_num=filter(str.isalnum,str(view_num))
pic_path=base_url+soup.find("div",{"class":"img-box"}).find("img").get("src")
temp = time.time()
x = time.localtime(float(temp))
acq_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now
curr.execute("select id from pybbs_wxapp_store where `from`=%s",(detail_url))
y= curr.fetchone()
if not y:
y1=download_img(icon_url,"icon")
y2=download_img(barcode_path,"barcode")
y3=download_img(pic_path,"pic")
if (y1 is not None) and (y2 is not None) and (y3 is not None):
name=name
author=None
classify=classify
describe=None
view_num=view_num
#print view_num
logo=y1
_from=detail_url
barcode=y2
acq_time=acq_time
hot_weight=-9999
pic_uuid=str(uuid.uuid1()).replace("-","")
pic_path=y3
#print name,author,classify,describe,view_num,logo,_from,barcode,acq_time,hot_weight,pic_uuid
curr.execute("INSERT INTO pybbs_wxapp_store(name,author,classify,`describe`,view_num,logo,`from`,barcode,acq_time,hot_weight,pic_path)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(name,author,classify,describe,view_num,logo,_from,barcode,acq_time,hot_weight,pic_path))
curr.execute("select id from pybbs_wxapp_classify where `classify_name`=%s",(classify))
yx= curr.fetchone()
if not yx:
describe=None
temp = time.time()
x = time.localtime(float(temp))
record_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now
curr.execute("INSERT INTO pybbs_wxapp_classify(classify_name,`describe`,record_time)VALUES(%s,%s,%s)",(classify,describe,record_time))
how_many+=1
print "new comer:",pic_uuid,">>",how_many
if how_many % 10==0:
conn.commit()
conn.commit()
except Exception as e:
print "while error",e
if __name__ == "__main__":
i=3
while i>0:
work()
i=i-1
其中有些参数请改成自己的,比如说数据库密码了,图片存储到哪个盘,数据库表格自己建立,因为这些实在太简单了,所以没啥可以唠叨的。
文章版权归作者所有,未经允许请勿转载,若此文章存在违规行为,您可以联系管理员删除。
转载请注明本文地址:https://www.ucloud.cn/yun/41197.html
摘要:时间永远都过得那么快,一晃从年注册,到现在已经过去了年那些被我藏在收藏夹吃灰的文章,已经太多了,是时候把他们整理一下了。那是因为收藏夹太乱,橡皮擦给设置私密了,不收拾不好看呀。 ...
摘要:爬虫目标是获取用户的微博数关注数粉丝数。创建数据这部分我只需要个人信息,微博数,关注数分数数这些基本信息就行。 前言 Scrapy学习(三) 爬取豆瓣图书信息 接上篇之后。这次来爬取需要登录才能访问的微博。爬虫目标是获取用户的微博数、关注数、粉丝数。为建立用户关系图(尚未实现)做数据储备 准备 安装第三方库requests和pymongo 安装MongoDB 创建一个weibo爬虫项...
摘要:本人长期出售超大量微博数据旅游网站评论数据,并提供各种指定数据爬取服务,。如果用户传入伪造的,则新浪微博会返回一个错误。 PS:(本人长期出售超大量微博数据、旅游网站评论数据,并提供各种指定数据爬取服务,Message to YuboonaZhang@Yahoo.com。由于微博接口更新后限制增大,这个代码已经不能用来爬数据了。如果只是为了收集数据可以咨询我的邮箱,如果是为了学习爬虫,...
摘要:本人长期出售超大量微博数据旅游网站评论数据,并提供各种指定数据爬取服务,。如果用户传入伪造的,则新浪微博会返回一个错误。 PS:(本人长期出售超大量微博数据、旅游网站评论数据,并提供各种指定数据爬取服务,Message to YuboonaZhang@Yahoo.com。由于微博接口更新后限制增大,这个代码已经不能用来爬数据了。如果只是为了收集数据可以咨询我的邮箱,如果是为了学习爬虫,...
阅读 4384·2021-11-22 13:52
阅读 2260·2021-09-22 15:12
阅读 1334·2019-08-30 15:53
阅读 3605·2019-08-29 17:12
阅读 2316·2019-08-29 16:23
阅读 1821·2019-08-26 13:56
阅读 2009·2019-08-26 13:44
阅读 2057·2019-08-26 11:56