**A multithreaded crawler for the latest movie resources on Movie Heaven (ygdy8), plus a movie search script.** Note: the results are written into an HTML file so they render as a table. The thread count can be changed directly in the script; in testing, 30 threads occasionally triggered IP rate-limiting. Environment: Python 3. Latest-movie crawler code:

# -*- coding: utf-8 -*-
"""Multithreaded crawler for the latest movie resources on Movie Heaven (ygdy8).

Stage 1 crawls the "latest movies" list pages and collects every detail-page
href. Stage 2 fetches each detail page and streams one table row per movie
(title + download links) into result.html, wrapped in Bootstrap-styled HTML.
"""
import random
import threading
import requests as req
from lxml import etree
from queue import Queue

BASE_URL_COM = 'http://www.ygdy8.com'
BASE_URL_NET = 'http://www.ygdy8.net'
THREADS = 20      # worker threads; ~30 was observed to trigger IP restrictions
PAGE_TOTAL = 100  # number of list pages to crawl

# Static HTML scaffolding written before/after the generated <tr> rows.
HEAD = ('<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">'
        '<title>Movie Heaven</title>'
        '<link href="https://cdn.bootcss.com/bootstrap/4.0.0/css/bootstrap.min.css"'
        ' rel="stylesheet"></head><body><table class="table">'
        '<thead class="thead-dark"><tr><th scope="col">#</th>'
        '<th scope="col">Movie name</th>'
        '<th scope="col">Download address</th></tr></thead>'
        '<tbody class="table-hover">')
FOOT = '</tbody></table></body></html>'

count = 1                      # global row number in the output table
_count_lock = threading.Lock() # guards `count` and file writes across workers

USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
    'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
]


def get_headers():
    """Return request headers with a randomly chosen User-Agent string."""
    return {'User-Agent': random.choice(USER_AGENT_LIST)}


def get_url(list_queue, url_queue):
    """Worker: fetch list pages from list_queue, push detail-page hrefs to url_queue."""
    while True:
        url = list_queue.get()
        try:
            res = req.get(url, headers=get_headers())
            # Pages are served in a legacy encoding (GBK); let requests detect it.
            res.encoding = res.apparent_encoding
            html = etree.HTML(res.text)
            for tag in html.xpath('//div[@class="co_content8"]/ul//a'):
                href = tag.get('href')
                url_queue.put(href)
                print('[Subscribe] [%s]' % href)
        except Exception:
            print('[Subscribe Error] %s' % url)
        finally:
            list_queue.task_done()


def get_list(list_queue):
    """Seed list_queue with the URLs of all PAGE_TOTAL list pages."""
    list_url = 'http://www.ygdy8.com/html/gndy/dyzz/list_23_%d.html'
    for i in range(1, PAGE_TOTAL + 1):
        list_queue.put(list_url % i)


def parse_download(url):
    """Fetch one movie detail page; return (title, list of download links).

    Raises on network failure or when the expected elements are missing,
    so the caller can retry against the mirror domain.
    """
    res = req.get(url, headers=get_headers())
    res.encoding = res.apparent_encoding
    html = etree.HTML(res.text)
    title = html.xpath('//div[@class="bd3r"]//div[@class="title_all"]/h1/font')[0].text
    downloads = html.xpath('//div[@id="Zoom"]//table//a/@href')
    return title, downloads


def parse_html(url_queue, result_file):
    """Worker: resolve each href on .com (falling back to .net) and write a row."""
    global count
    while True:
        url_path = url_queue.get()
        try:
            try:
                title, downloads = parse_download(BASE_URL_COM + url_path)
            except Exception:
                # Some entries only resolve on the mirror domain.
                title, downloads = parse_download(BASE_URL_NET + url_path)
            download = '<hr>'.join(downloads)
            # Serialize the counter bump and the file write so row numbers stay
            # unique and rows are never interleaved between threads.
            with _count_lock:
                tr = ('<tr><th scope="row">%d</th><td>%s</td><td>%s</td></tr>'
                      % (count, title, download))
                result_file.write(tr)
                print('[OK][%d] %s' % (count, title))
                count += 1
        except Exception:
            print('[Parse error] %s' % url_path)
        finally:
            url_queue.task_done()


def thread(queue, target, args):
    """Start THREADS daemon workers running target(*args); block until every
    item put on `queue` has been marked task_done."""
    for _ in range(THREADS):
        t = threading.Thread(target=target, args=args, daemon=True)
        t.start()
    queue.join()


def main():
    list_queue = Queue()
    url_queue = Queue()
    get_list(list_queue)
    # Stage 1: collect all detail-page hrefs.
    thread(list_queue, get_url, (list_queue, url_queue))
    # Stage 2: parse each detail page and stream rows into the HTML file.
    with open('result.html', 'w', encoding='utf-8') as result_file:
        result_file.write(HEAD)
        thread(url_queue, parse_html, (url_queue, result_file))
        result_file.write(FOOT)
    print('End... TieZi \nEnd... TieZi \nEnd... TieZi ')


if __name__ == '__main__':
    main()

Movie search script code:

# -*- coding: utf-8 -*-
"""Movie search for Movie Heaven (ygdy8).

Usage: python movie_search.py movie_name
Queries the site's search endpoint for the given title, then fetches every
matching detail page in parallel and prints its title and download links.
"""
import sys
import random
import requests as req
from urllib import parse
from lxml import etree
from multiprocessing import Pool

BASE_URL = 'http://www.ygdy8.com'
# The search endpoint expects the keyword GBK-encoded and URL-quoted.
SEARCH_URL = ('http://s.ygdy8.com/plus/so.php'
              '?kwtype=0&searchtype=title&pagesize=1000&keyword=')

USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
    'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
]


def get_headers():
    """Return request headers with a randomly chosen User-Agent string."""
    return {'User-Agent': random.choice(USER_AGENT_LIST)}


def search(keyword):
    """Query the search endpoint; return absolute URLs of matching detail pages."""
    quoted = parse.quote(keyword.encode('gbk'))  # site search only accepts GBK
    res = req.get(SEARCH_URL + quoted, headers=get_headers())
    res.encoding = res.apparent_encoding
    html = etree.HTML(res.text)
    tags = html.xpath('//div[@class="co_content8"]/ul//a')
    return [BASE_URL + tag.get('href') for tag in tags]


def parse_html(url):
    """Fetch one detail page; print its title and every download link."""
    res = req.get(url, headers=get_headers())
    res.encoding = res.apparent_encoding
    html = etree.HTML(res.text)
    title = html.xpath('//div[@class="bd3r"]//div[@class="title_all"]/h1/font')[0].text
    downloads = html.xpath('//div[@id="Zoom"]//table//a/@href')
    print('[%s]' % title)
    for download in downloads:
        print('[download] [%s]' % download)
    print('\n|---------------------------------------------------------|\n')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: python %s movie_name' % sys.argv[0])
        sys.exit(-1)
    urls = search(sys.argv[1])
    # Fan the detail-page fetches out across a process pool.
    with Pool() as pool:
        pool.map(parse_html, urls)

The latest-movie crawler in action:

The generated HTML table

Movie search results:

The crawled download links include both FTP and magnet links; if a movie page provides a magnet link, it is crawled as well.

The search results