摘要:本人建立个群作为去转盘网的官方群,人数现在也不多,如果有兴趣的话来逛逛吧,多个粉丝去转盘多一份热闹,群号
之前我在写百度网盘爬虫,百度图片爬虫的时候答应网友说,抽时间要把ok搜搜的的源码公开,如今是时候兑现诺言了,下面就是爬虫的所有代码,完全,彻底的公开,你会不会写程序都可以使用,不过请先装个linux系统,具备公网条件,然后运行:
python startCrawler.py
有必要提醒你,数据库字段代码中都有,请你自己建张表格,这个太简单了,就不多说了。同时我也提供一下下载地址,源码都在:下载地址1 下载地址2
#!/usr/bin/env python # encoding: utf-8 """ author:haoning create time:2015.8.1 """ import hashlib import os import time import datetime import traceback import sys import random import json import socket import threading from hashlib import sha1 #进行hash加密 from random import randint from struct import unpack from socket import inet_ntoa from threading import Timer, Thread from time import sleep from collections import deque from Queue import Queue import MySQLdb as mdb #数据库连接器 import metautils import downloadTorrent from bencode import bencode, bdecode import pygeoip DB_HOST = "127.0.0.1" DB_USER = "root" DB_PASS = "root" BOOTSTRAP_NODES = ( ("67.215.246.10", 6881), ("82.221.103.244", 6881), ("23.21.224.150", 6881) ) RATE = 1 #调控速率 TID_LENGTH = 2 RE_JOIN_DHT_INTERVAL = 3 TOKEN_LENGTH = 2 INFO_HASH_LEN = 500000 #50w数据很小,限制内存不至于消耗太大 CACHE_LEN = 100 #更新数据库缓存 WAIT_DOWNLOAD = 80 geoip = pygeoip.GeoIP("GeoIP.dat") def is_ip_allowed(ip): country = geoip.country_code_by_addr(ip) if country in ("CN","TW","JP","HK", "KR"): return True return False def entropy(length): return "".join(chr(randint(0, 255)) for _ in xrange(length)) def random_id(): h = sha1() h.update(entropy(20)) return h.digest() def decode_nodes(nodes): n = [] length = len(nodes) if (length % 26) != 0: return n for i in range(0, length, 26): nid = nodes[i:i+20] ip = inet_ntoa(nodes[i+20:i+24]) port = unpack("!H", nodes[i+24:i+26])[0] n.append((nid, ip, port)) return n def timer(t, f): Timer(t, f).start() def get_neighbor(target, nid, end=10): return target[:end]+nid[end:] class KNode(object): def __init__(self, nid, ip, port): self.nid = nid self.ip = ip self.port = port class DHTClient(Thread): def __init__(self, max_node_qsize): Thread.__init__(self) self.setDaemon(True) self.max_node_qsize = max_node_qsize self.nid = random_id() self.nodes = deque(maxlen=max_node_qsize) def send_krpc(self, msg, address): try: self.ufd.sendto(bencode(msg), address) except Exception: pass def send_find_node(self, address, nid=None): nid = get_neighbor(nid, self.nid) if nid else self.nid tid = entropy(TID_LENGTH) msg = { "t": tid, "y": "q", "q": "find_node", "a": { "id": nid, "target": random_id() } } self.send_krpc(msg, address) def join_DHT(self): for address in BOOTSTRAP_NODES: self.send_find_node(address) def re_join_DHT(self): if len(self.nodes) == 0: self.join_DHT() timer(RE_JOIN_DHT_INTERVAL, self.re_join_DHT) def auto_send_find_node(self): wait = 1.0 / self.max_node_qsize while True: try: node = self.nodes.popleft() self.send_find_node((node.ip, node.port), node.nid) except IndexError: pass try: sleep(wait) except KeyboardInterrupt: os._exit(0) def process_find_node_response(self, msg, address): nodes = decode_nodes(msg["r"]["nodes"]) for node in nodes: (nid, ip, port) = node if len(nid) != 20: continue if ip == self.bind_ip: continue n = KNode(nid, ip, port) self.nodes.append(n) class DHTServer(DHTClient): #获得info_hash def __init__(self, master, bind_ip, bind_port, max_node_qsize): DHTClient.__init__(self, max_node_qsize) self.master = master self.bind_ip = bind_ip self.bind_port = bind_port self.speed=0 self.process_request_actions = { "get_peers": self.on_get_peers_request, "announce_peer": self.on_announce_peer_request, } self.ufd = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP) self.ufd.bind((self.bind_ip, self.bind_port)) timer(RE_JOIN_DHT_INTERVAL, self.re_join_DHT) def run(self): self.re_join_DHT() while True: try: (data, address) = self.ufd.recvfrom(65536) msg = bdecode(data) self.on_message(msg, address) except Exception: pass def on_message(self, msg, address): global RATE #设为全局量 try: if msg["y"] == "r": if msg["r"].has_key("nodes"): self.process_find_node_response(msg, address) #发现节点 elif msg["y"] == "q": try: self.speed+=1 if self.speed % 10000 ==0: RATE=random.randint(1,3) if RATE==2: RATE=1 if RATE==3: RATE=10 if self.speed>100000: self.speed=0 if self.speed % RATE==0: #数据过多,占用cpu太多,划分限速,1,1,10 self.process_request_actions[msg["q"]](msg, address) #处理其他节点的请求,这个过程获取info_hash #self.process_request_actions[msg["q"]](msg, address) #处理其他节点的请求,这个过程获取info_hash except KeyError: self.play_dead(msg, address) except KeyError: pass def on_get_peers_request(self, msg, address): try: infohash = msg["a"]["info_hash"] tid = msg["t"] nid = msg["a"]["id"] token = infohash[:TOKEN_LENGTH] msg = { "t": tid, "y": "r", "r": { "id": get_neighbor(infohash, self.nid), "nodes": "", "token": token } } self.master.log(infohash, address) self.send_krpc(msg, address) except KeyError: pass def on_announce_peer_request(self, msg, address): try: infohash = msg["a"]["info_hash"] token = msg["a"]["token"] nid = msg["a"]["id"] tid = msg["t"] if infohash[:TOKEN_LENGTH] == token: if msg["a"].has_key("implied_port ") and msg["a"]["implied_port "] != 0: port = address[1] else: port = msg["a"]["port"] self.master.log_announce(infohash, (address[0], port)) except Exception: print "error" pass finally: self.ok(msg, address) def play_dead(self, msg, address): try: tid = msg["t"] msg = { "t": tid, "y": "e", "e": [202, "Server Error"] } self.send_krpc(msg, address) except KeyError: pass def ok(self, msg, address): try: tid = msg["t"] nid = msg["a"]["id"] msg = { "t": tid, "y": "r", "r": { "id": get_neighbor(nid, self.nid) } } self.send_krpc(msg, address) except KeyError: pass class Master(Thread): #解析info_hash def __init__(self): Thread.__init__(self) self.setDaemon(True) self.queue = Queue() self.cache = Queue() self.count=0 self.mutex = threading.RLock() #可重入锁,使单线程可以再次获得已经获得的? self.waitDownload = Queue() self.metadata_queue = Queue() self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "oksousou", charset="utf8") self.dbconn.autocommit(False) self.dbcurr = self.dbconn.cursor() self.dbcurr.execute("SET NAMES utf8") self.visited = set() def lock(self): #加锁 self.mutex.acquire() def unlock(self): #解锁 self.mutex.release() def work(self,item): print "start thread",item while True: self.prepare_download_metadata() self.lock() self.download_metadata() self.unlock() self.lock() self.got_torrent() self.unlock() def start_work(self,max): for item in xrange(max): t = threading.Thread(target=self.work, args=(item,)) t.setDaemon(True) t.start() #入队的种子效率更高 def log_announce(self, binhash, address=None): if self.queue.qsize() < INFO_HASH_LEN : #大于INFO_HASH_LEN就不要入队,否则后面来不及处理 if is_ip_allowed(address[0]): self.queue.put([address, binhash]) #获得info_hash def log(self, infohash, address=None): if self.queue.qsize() < INFO_HASH_LEN: #大于INFO_HASH_LEN/2就不要入队,否则后面来不及处理 if is_ip_allowed(address[0]): self.queue.put([address, infohash]) def prepare_download_metadata(self): if self.queue.qsize() == 0: sleep(2) #从queue中获得info_hash用来下载 address, binhash= self.queue.get() if binhash in self.visited: return if len(self.visited) > 100000: #大于100000重置队列,认为已经访问过了 self.visited = set() self.visited.add(binhash) #跟新已经访问过的info_hash info_hash = binhash.encode("hex") utcnow = datetime.datetime.utcnow() self.cache.put((address,binhash,utcnow)) #装入缓存队列 def download_metadata(self): if self.cache.qsize() > CACHE_LEN/2: #出队更新下载 while self.cache.qsize() > 0: #排空队列 address,binhash,utcnow = self.cache.get() info_hash = binhash.encode("hex") self.dbcurr.execute("SELECT id FROM search_hash WHERE info_hash=%s", (info_hash,)) y = self.dbcurr.fetchone() if y: # 更新最近发现时间,请求数 self.dbcurr.execute("UPDATE search_hash SET last_seen=%s, requests=requests+1 WHERE info_hash=%s", (utcnow, info_hash)) else: self.waitDownload.put((address, binhash)) self.dbconn.commit() if self.waitDownload.qsize() > WAIT_DOWNLOAD: while self.waitDownload.qsize() > 0: address,binhash = self.waitDownload.get() t = threading.Thread(target=downloadTorrent.download_metadata, args=(address, binhash, self.metadata_queue)) t.setDaemon(True) t.start() def decode(self, s): if type(s) is list: s = ";".join(s) u = s for x in (self.encoding, "utf8", "gbk", "big5"): try: u = s.decode(x) return u except: pass return s.decode(self.encoding, "ignore") def decode_utf8(self, d, i): if i+".utf-8" in d: return d[i+".utf-8"].decode("utf8") return self.decode(d[i]) def parse_metadata(self, data): #解析种子 info = {} self.encoding = "utf8" try: torrent = bdecode(data) #编码后解析 if not torrent.get("name"): return None except: return None detail = torrent info["name"] = self.decode_utf8(detail, "name") if "files" in detail: info["files"] = [] for x in detail["files"]: if "path.utf-8" in x: v = {"path": self.decode("/".join(x["path.utf-8"])), "length": x["length"]} else: v = {"path": self.decode("/".join(x["path"])), "length": x["length"]} if "filehash" in x: v["filehash"] = x["filehash"].encode("hex") info["files"].append(v) info["length"] = sum([x["length"] for x in info["files"]]) else: info["length"] = detail["length"] info["data_hash"] = hashlib.md5(detail["pieces"]).hexdigest() return info def got_torrent(self): if self.metadata_queue.qsize() == 0: return binhash, address, data,start_time = self.metadata_queue.get() if not data: return try: info = self.parse_metadata(data) if not info: return except: traceback.print_exc() return temp = time.time() x = time.localtime(float(temp)) utcnow = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now info_hash = binhash.encode("hex") #磁力 info["info_hash"] = info_hash # need to build tags info["tagged"] = False info["classified"] = False info["requests"] = 1 info["last_seen"] = utcnow info["create_time"] = utcnow info["source_ip"] = address[0] if info.get("files"): files = [z for z in info["files"] if not z["path"].startswith("_")] if not files: files = info["files"] else: files = [{"path": info["name"], "length": info["length"]}] files.sort(key=lambda z:z["length"], reverse=True) bigfname = files[0]["path"] info["extension"] = metautils.get_extension(bigfname).lower() info["category"] = metautils.get_category(info["extension"]) try: try: print " ", "Saved", info["info_hash"], info["name"], (time.time()-start_time), "s", address[0] except: print " ", "Saved", info["info_hash"] ret = self.dbcurr.execute("INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged," + "length,create_time,last_seen,requests) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (info["info_hash"], info["category"], info["data_hash"], info["name"], info["extension"], info["classified"], info["source_ip"], info["tagged"], info["length"], info["create_time"], info["last_seen"], info["requests"])) if self.count %50 ==0: self.dbconn.commit() if self.count>100000: self.count=0 except: print self.name, "save error", self.name, info traceback.print_exc() return if __name__ == "__main__": #启动客户端 master = Master() master.start_work(150) #启动服务器 dht = DHTServer(master, "0.0.0.0", 6881, max_node_qsize=200) dht.start() dht.auto_send_find_node()
注意,上面的代码有一段代码需要下载种子,所以下面的这段代码十分重要:
#!/usr/bin/env python # encoding: utf-8 """ author:haoning create time:2015.8.1 """ from hashlib import sha1 import math from socket import inet_ntoa import socket from struct import pack, unpack from threading import Timer, Thread from time import sleep, time from bencode import bencode, bdecode from startCrawler import entropy BT_PROTOCOL = "BitTorrent protocol" BT_MSG_ID = 20 EXT_HANDSHAKE_ID = 0 def random_id(): hash = sha1() hash.update(entropy(20)) return hash.digest() def send_packet(the_socket, msg): the_socket.send(msg) def send_message(the_socket, msg): msg_len = pack(">I", len(msg)) send_packet(the_socket, msg_len + msg) def send_handshake(the_socket, infohash): bt_header = chr(len(BT_PROTOCOL)) + BT_PROTOCOL ext_bytes = "x00x00x00x00x00x10x00x00" peer_id = random_id() packet = bt_header + ext_bytes + infohash + peer_id send_packet(the_socket, packet) def check_handshake(packet, self_infohash): try: bt_header_len, packet = ord(packet[:1]), packet[1:] if bt_header_len != len(BT_PROTOCOL): return False except TypeError: return False bt_header, packet = packet[:bt_header_len], packet[bt_header_len:] if bt_header != BT_PROTOCOL: return False packet = packet[8:] infohash = packet[:20] if infohash != self_infohash: return False return True def send_ext_handshake(the_socket): msg = chr(BT_MSG_ID) + chr(EXT_HANDSHAKE_ID) + bencode({"m":{"ut_metadata": 1}}) send_message(the_socket, msg) def request_metadata(the_socket, ut_metadata, piece): """bep_0009""" msg = chr(BT_MSG_ID) + chr(ut_metadata) + bencode({"msg_type": 0, "piece": piece}) send_message(the_socket, msg) def get_ut_metadata(data): ut_metadata = "_metadata" index = data.index(ut_metadata)+len(ut_metadata) + 1 return int(data[index]) def get_metadata_size(data): metadata_size = "metadata_size" start = data.index(metadata_size) + len(metadata_size) + 1 data = data[start:] return int(data[:data.index("e")]) def recvall(the_socket, timeout=5): the_socket.setblocking(0) total_data = [] data = "" begin = time() while True: sleep(0.05) if total_data and time()-begin > timeout: break elif time()-begin > timeout*2: break try: data = the_socket.recv(1024) if data: total_data.append(data) begin = time() except Exception: pass return "".join(total_data) def download_metadata(address, infohash, metadata_queue, timeout=5): metadata = None start_time = time() the_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) try: the_socket.settimeout(timeout) the_socket.connect(address) # handshake send_handshake(the_socket, infohash) packet = the_socket.recv(4096) # handshake error if not check_handshake(packet, infohash): return # ext handshake send_ext_handshake(the_socket) packet = the_socket.recv(4096) # get ut_metadata and metadata_size ut_metadata, metadata_size = get_ut_metadata(packet), get_metadata_size(packet) # request each piece of metadata metadata = [] for piece in range(int(math.ceil(metadata_size/(16.0*1024)))): #piece是个控制块,根据控制块下载数据 request_metadata(the_socket, ut_metadata, piece) packet = recvall(the_socket, timeout) #the_socket.recv(1024*17) metadata.append(packet[packet.index("ee")+2:]) metadata = "".join(metadata) except socket.timeout: pass except Exception, e: pass finally: #print "metadata= %s" %(metadata) the_socket.close() #确保没回都关闭socket if metadata != None: #只让不空的种子入? metadata_queue.put((infohash, address, metadata,start_time))
其实下载种子还有一种方式就是借助libtorrent,但这个太耗费cpu了,所以我一般不用他,如下:
#coding: utf8 import threading import traceback import random import time import os import socket import libtorrent as lt threading.stack_size(200*1024) socket.setdefaulttimeout(30) def fetch_torrent(session, ih, timeout): name = ih.upper() url = "magnet:?xt=urn:btih:%s" % (name,) data = "" params = { "save_path": "/tmp/downloads/", "storage_mode": lt.storage_mode_t(2), "paused": False, "auto_managed": False, "duplicate_is_error": True} try: handle = lt.add_magnet_uri(session, url, params) except: return None status = session.status() handle.set_sequential_download(1) meta = None down_time = time.time() down_path = None for i in xrange(0, timeout): if handle.has_metadata(): info = handle.get_torrent_info() down_path = "/tmp/downloads/%s" % info.name() #print "status", "p", status.num_peers, "g", status.dht_global_nodes, "ts", status.dht_torrents, "u", status.total_upload, "d", status.total_download meta = info.metadata() break time.sleep(1) if down_path and os.path.exists(down_path): os.system("rm -rf "%s"" % down_path) session.remove_torrent(handle) return meta def download_metadata(address, binhash, metadata_queue, timeout=20): metadata = None start_time = time.time() try: session = lt.session() r = random.randrange(10000, 50000) session.listen_on(r, r+10) session.add_dht_router("router.bittorrent.com",6881) session.add_dht_router("router.utorrent.com",6881) session.add_dht_router("dht.transmission.com",6881) session.add_dht_router("127.0.0.1",6881) session.start_dht() metadata = fetch_torrent(session, binhash.encode("hex"), timeout) session = None except: traceback.print_exc() finally: metadata_queue.put((binhash, address, metadata,start_time))
这个爬虫还是耗费了本人和其他网上高手的很多时间的,请看到这篇博客的朋友保持钻研精神,开源精神,多多交流,秉承分享。本人建立个qq群作为去转盘网的官方群,人数现在也不多,如果有兴趣的话来逛逛吧,多个粉丝去转盘多一份热闹,qq群号:512245829
文章版权归作者所有,未经允许请勿转载,若此文章存在违规行为,您可以联系管理员删除。
转载请注明本文地址:https://www.ucloud.cn/yun/45501.html
摘要:闲话不多说了,接下来谈谈网络爬虫吧。根据中的到指定端口使用扩展协议进行数据的交换即下载下载成功,解析出种子文件列表信息入库。具体实现请参考我的开源项目代码如有问题,欢迎指正,仅供技术交流,切勿用作非法商业用途。 演示地址: https://dodder.cc 三年前,照着 Python 版的 DHT 网络爬虫用 Java 重写了一遍,当时大学还未毕业,写出来的代码比较杂乱,数据跑到 1...
摘要:项目简介前端站点项目效果预览使用实现磁力链接爬虫磁力链接解析成种子信息,保存到数据库,利用实现中文检索。搭建磁力链接搜索引擎源码地址后端脚本磁力链接获取磁力链接解析入库定时同步源码地址此项目仅用学习交流技术使用不做商业用途。 项目简介 前端站点 项目效果预览 http://findcl.com 使用 nodejs 实现磁力链接爬虫 磁力链接解析成 torrent种子信息,保存到数据...
阅读 2410·2023-04-25 22:15
阅读 1704·2021-11-19 09:40
阅读 2106·2021-09-30 09:48
阅读 3174·2021-09-03 10:36
阅读 2002·2021-08-30 09:48
阅读 1810·2021-08-24 10:00
阅读 2694·2019-08-30 15:54
阅读 660·2019-08-30 15:54