Python3 crawls Baidu images based on keywords

Introduction

A working Baidu image crawler script; its only drawback is that it is single-threaded.
Runtime environment: Python 3.6.0.
Reason for writing this script: I needed to download images.
It was written with the help of the references listed at the end of this post.

The source code

# coding: UTF-8
"""Download Baidu image-search results for a keyword (single-threaded).

The script queries Baidu's ``acjson`` image-search endpoint page by page,
extracts the obfuscated ``objURL`` fields, decodes them into real image
URLs and saves the images into a user-chosen directory.

``requests`` is the only third-party dependency.  It is imported inside the
functions that perform network I/O so that the pure helpers (``decode``,
``buildUrls``, ``resolveImgUrl``, ``mkDir``) can be imported and tested
without it.
"""
import itertools
import os
import re
import sys
import urllib.parse

# Baidu obfuscates the "objURL" field of its image-search results.
# Decoding scheme taken from:
# http://blog.csdn.net/hbuxiaoshe/article/details/44780653
str_table = {
    '_z2C$q': ':',
    '_z&e3B': '.',
    'AzdH3F': '/',
}

char_table = {
    'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f',
    '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l',
    '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r',
    'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1',
    'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7',
    'b': '8', 'l': '9', 'a': '0',
}
# str.translate() expects a mapping keyed by code point.
char_table = {ord(key): ord(value) for key, value in char_table.items()}


def decode(url):
    """Return the real image URL for an obfuscated Baidu ``objURL`` value.

    The multi-character tokens in ``str_table`` are replaced first, then
    every remaining character is mapped through ``char_table``.  Characters
    that appear in neither table are left unchanged.
    """
    for key, value in str_table.items():
        url = url.replace(key, value)
    return url.translate(char_table)


def buildUrls(word):
    """Return an endless generator of search-result page URLs for *word*.

    Each page holds 60 results (``rn=60``), so the ``pn`` offset advances in
    steps of 60.  The caller is responsible for stopping the iteration.
    """
    word = urllib.parse.quote(word)
    # NOTE(review): "istype=2nc=1" looks like it is missing an "&" before
    # "nc"; it is kept exactly as published because the effect of changing
    # it on Baidu's endpoint has not been verified.
    url = (
        r"http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
        r"&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8"
        r"&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2nc=1"
        r"&pn={pn}&rn=60"
    )
    urls = (url.format(word=word, pn=x) for x in itertools.count(start=0, step=60))
    return urls


re_url = re.compile(r'"objURL":"(.*?)"')


def resolveImgUrl(html):
    """Return the decoded image URLs found in one result page's text."""
    imgUrls = [decode(x) for x in re_url.findall(html)]
    return imgUrls


def downImgs(imgUrl, dirpath, imgName, imgType):
    """Download one image and save it as ``<dirpath>/<imgName>.<imgType>``.

    Returns True when the file was written, False when the request failed
    or the server answered with an HTTP error status.
    """
    import requests  # third-party; see the module docstring

    filename = os.path.join(dirpath, imgName)
    try:
        res = requests.get(imgUrl, timeout=15)
    except Exception as e:
        # Deliberately broad: decoded URLs come from untrusted page content
        # and one bad image must not abort the whole crawl.
        print(' throw exception :', imgUrl)
        print(e)
        return False
    # Reject 5xx as well as 4xx so an error page is never saved as an image.
    if res.status_code >= 400:
        print(str(res.status_code), ":", imgUrl)
        return False
    with open(filename + '.' + imgType, 'wb') as f:
        f.write(res.content)
    return True


def mkDir(dirName):
    """Create *dirName* (relative to the script directory) and return its path.

    An absolute *dirName* is used as given, because ``os.path.join`` discards
    the leading component when the second one is absolute.  Missing parent
    directories are created and an existing directory is not an error.
    """
    dirpath = os.path.join(sys.path[0], dirName)
    os.makedirs(dirpath, exist_ok=True)
    return dirpath


def _ask_int(prompt, default=None):
    """Read an integer from stdin.

    On non-numeric input return *default*; when *default* is None, ask again
    until a number is entered.
    """
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            if default is not None:
                return default


def _crawl(word, dirpath, strtag, imgType, numIMGS):
    """Download up to *numIMGS* images for *word*; return how many were saved.

    Stops early when a result page contains no image URLs.
    """
    import requests  # third-party; see the module docstring

    index = 0
    if numIMGS <= 0:
        return index
    for url in buildUrls(word):
        print(" request: ", url)
        html = requests.get(url, timeout=10).content.decode('utf-8')
        imgUrls = resolveImgUrl(html)
        if not imgUrls:
            # An empty page means the search results are exhausted.
            break
        for imgUrl in imgUrls:
            if downImgs(imgUrl, dirpath, str(index + 1) + ' ' + strtag, imgType):
                index += 1
                if index >= numIMGS:
                    break
        if index >= numIMGS:
            break
    return index


def main():
    """Interactively collect the download options and run the crawl."""
    print(" download result saved in img folder in script directory ")
    choosePath = _ask_int(
        ' Please enter the path you want to save \n 1. Default path path = IMGS/ '
        '\n 2. Relative path path_input/path_input/ '
        '\n 3. Absolute path, such as D:/IMG/\n',
        default=1,
    )
    if choosePath in (2, 3):
        # mkDir handles both cases: a relative path is anchored at the script
        # directory, an absolute path is used unchanged.
        dirpath = mkDir(input(' Please enter the path where you want to save the image \n'))
    else:
        dirpath = mkDir('IMGS')

    print("= = " * 25)
    word = input(' Please enter the keyword of the images you want to download \n')

    print("= = " * 25)
    chooseImgType = _ask_int(
        ' Please select image format \n 0. Default: JPG \n 1.jpg \n 2.png '
        '\n 3. GIF \n 4. Custom \n',
        default=0,
    )
    if chooseImgType == 4:
        # Tolerate ".png"-style input and fall back to the default when empty.
        imgType = input(' Please enter custom image type \n').strip().lstrip('.') or 'JPG'
    else:
        imgType = {2: 'png', 3: 'gif'}.get(chooseImgType, 'JPG')

    print("= = " * 25)
    strtag = input(
        " Please enter the name of the image you want to download, "
        "the last format is number+' '+name.%s\n" % imgType
    )

    print("= = " * 25)
    numIMGS = _ask_int(' please enter the number of images you want to download \n')

    print("= = " * 25)
    index = _crawl(word, dirpath, strtag, imgType, numIMGS)
    print(' You downloaded %s images' % index)
    print(' program terminating ')


if __name__ == '__main__':
    main()

Run the demo

References

[1] Baidu image URL decoding: http://blog.csdn.net/hbuxiaoshe/article/details/44780653
[2] Crawling Baidu images with Python and converting .py files to .exe: zhuanlan.zhihu.com/p/25605555

Python3 crawls Baidu images based on keywords

Introduction

The source code

Run the demo

References

Related Posts

Numpy operates on arrays of different dimensions

How to realize timing monitoring of streaming media system?

A UI layout framework that implements UI setup and layout control with minimal code