purpose

Crawl thousands of pictures from Sogou Pictures (pic.sogou.com) and download them to the local disk.

The preparatory work

Crawl address: pic.sogou.com/pics?query=…

Analysis

Open the address above, press F12 to open the developer tools, and select Network → XHR. Scroll down the page and the XHR tab will display the request information as follows:

The Request URL: pic.sogou.com/napi/pc/sea…

Analyze the main parameters of this request URL:

start=48 indicates that the result set begins at the 48th image

xml_len=48 means that 48 images are retrieved, counting from the 48th image

query=? is the search keyword (example: beauty; the browser URL-encodes it automatically, which does not affect our use). Click on Response and paste the content into a JSON formatter. JSON formatter: www.bejson.com/

By analyzing the information returned in the Response, we can see that the image address we want is stored in the picUrl field.

Approach

Through the above analysis, it is not difficult to realize the download method, the idea is as follows:

1. Set the URL request parameters 2. Send the URL request to obtain the image addresses 3. Store the image addresses in a List 4. Traverse the List and download the images locally using a thread pool

code

SougouImgProcessor.java — the image crawling class

import com.alibaba.fastjson.JSONObject; import us.codecraft.webmagic.utils.HttpClientUtils; import victor.chang.crawler.pipeline.SougouImgPipeline; import java.util.ArrayList; import java.util.List; /** * A simple PageProcessor. * @author [email protected] <br> * @since 0.1.0 */ public class SougouImgProcessor { private String url; private SougouImgPipeline pipeline; private List<JSONObject> dataList; private List<String> urlList; private String word; public SougouImgProcessor(String url,String word) { this.url = url; this.word = word; this.pipeline = new SougouImgPipeline(); this.dataList = new ArrayList<>(); this.urlList = new ArrayList<>(); } public void process(int idx, int size) { String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word)); JSONObject object = JSONObject.parseObject(res); List<JSONObject> items = (List<JSONObject>)((JSONObject)object.get("data")).get("items"); for(JSONObject item : items){ this.urlList.add(item.getString("picUrl")); } this.dataList.addAll(items); Public void pipelineData(){// Multithreaded pipeline.processsync (this.urlList, this.word); } public static void main(String[] args) { String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s"; SougouImgProcessor processor = new SougouImgProcessor(url," mycounter "); int start = 0, size = 50, limit = 1000; For (int I =start; i<start+limit; i+=size) processor.process(i, size); processor.pipelineData(); }}Copy the code

SougouImgPipeline.java — the image download class

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Store results in files.<br>
 * Downloads picture URLs into {@code <path>/<category>/} and keeps running counters of
 * successful and failed downloads.
 *
 * @author [email protected] <br>
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Connect and read timeout applied to every download, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Fallback file extension used when the URL does not carry a usable one. */
    private String extension = ".jpg";
    /** Root directory below which one sub-directory per category is created. */
    private String path;

    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    public SougouImgPipeline() {
        this("E:/pipeline/sougou");
    }

    public SougouImgPipeline(String path) {
        setPath(path);
    }

    public SougouImgPipeline(String path, String extension) {
        setPath(path);
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one picture into {@code <path>/<cate>/<name><ext>}.
     * The data is first written to a temporary file in the same directory and renamed only
     * after the transfer completed, so an interrupted download never leaves a truncated
     * picture that a later run would skip as "already exists".
     *
     * @param url  address of the picture
     * @param cate category, used as the sub-directory name
     * @param name file name without extension
     * @throws IOException if the directory cannot be created or the transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws IOException {
        File dir = new File(this.path + "/" + cate + "/");
        // The isDirectory() re-check covers another thread creating the directory concurrently.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory " + dir);
        }
        String fileName = (name + extensionOf(url)).replace("-", "");
        File img = new File(dir, fileName);
        if (img.exists()) {
            // Downloaded by an earlier run: skip.
            System.out.println(String.format(" file %s already exists in a local directory ", fileName));
            return;
        }

        URLConnection con = new URL(url).openConnection();
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);

        File partial = File.createTempFile("img", ".part", dir);
        try {
            try (InputStream in = con.getInputStream();
                 OutputStream out = new FileOutputStream(partial)) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }
            if (!partial.renameTo(img)) {
                throw new IOException("Cannot move " + partial + " to " + img);
            }
        } finally {
            // Still present only when the transfer or the rename failed.
            if (partial.exists()) {
                partial.delete();
            }
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format(" downloading the %s image ", suc.getAndIncrement()));
    }

    /**
     * Returns the file extension (including the dot) found in the path part of the URL,
     * ignoring query string and fragment, or the configured fallback extension when the
     * URL has none.
     */
    private String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String location = url.substring(0, end);
        int schemeEnd = location.indexOf("://");
        int pathStart = location.indexOf('/', schemeEnd < 0 ? 0 : schemeEnd + 3);
        int dot = location.lastIndexOf('.');
        // The dot must belong to the last path segment, not to the host name or a directory.
        if (pathStart >= 0 && dot > pathStart && dot > location.lastIndexOf('/')) {
            String candidate = location.substring(dot);
            if (candidate.matches("\\.[A-Za-z0-9]{1,5}")) {
                return candidate;
            }
        }
        return extension;
    }

    /**
     * Single-threaded processing: downloads every URL in order on the calling thread.
     * File names are derived from the hash of the URL, so the same URL always maps to the
     * same file and is skipped when it was downloaded before.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category (sub-directory name)
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                // The URL itself contains '/' and ':' and cannot serve as a file name.
                downloadImg(picUrl, word, Integer.toHexString(picUrl.hashCode()));
            } catch (Exception e) {
                fails.incrementAndGet();
            }
        }
        System.out.println("下载 : " + suc.get());
        System.out.println(" failed to download: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println(" time: " + (end - start) / 1000 + " seconds ");
    }

    /**
     * Multithreaded processing: submits one download task per URL to a cached thread pool,
     * waits up to 60 seconds for the tasks to finish and prints a summary.
     * Files are named after the zero-padded position of the URL in {@code data}.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category (sub-directory name)
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            // At least four digits; indexes >= 1000 keep a unique (longer) name.
            String finalName = String.format("%04d", i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, finalName);
                } catch (Exception e) {
                    fails.incrementAndGet();
                }
            });
        }
        executorService.shutdown();
        try {
            // On timeout the remaining downloads are deliberately left running (no
            // shutdownNow()); the summary below then reflects the state at that moment.
            executorService.awaitTermination(60, TimeUnit.SECONDS);
            System.out.println("AwaitTermination Finished");
            System.out.println(" common URL: " + data.size());
            System.out.println(" download success: " + suc);
            System.out.println(" Fails to download: " + fails);

            File dir = new File(this.path + "/" + word + "/");
            // list() returns null when the directory was never created (e.g. nothing to download).
            String[] names = dir.list();
            int len = names == null ? 0 : names.length;
            System.out.println(" Current shared file: " + len);

            long end = System.currentTimeMillis();
            System.out.println(" time: " + (end - start) / 1000.0 + " seconds ");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve the interrupt for the caller
            e.printStackTrace();
        }
    }

    /**
     * Multithreaded segmented processing: splits {@code data} into {@code threadNum}
     * consecutive slices and runs {@link #process(List, String)} on each slice in its own
     * pool thread. The method returns without waiting for the slices to finish.
     * Falls back to single-threaded processing on the calling thread when
     * {@code threadNum} is not positive or exceeds the number of URLs.
     *
     * @param data      picture URLs
     * @param word      category (sub-directory name)
     * @param threadNum number of slices / worker tasks
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // slice length
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // The last slice also takes the remainder of the division.
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }
}

HttpClientUtils.java — the HTTP request utility class

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @author [email protected]
 * Date: 17/3/27
 */
public abstract class HttpClientUtils {

    public static Map<String, List<String>> convertHeaders(Header[] headers) {
        Map<String, List<String>> results = new HashMap<String, List<String>>();
        for (Header header : headers) {
            List<String> list = results.get(header.getName());
            if (list == null) {
                list = new ArrayList<String>();
                results.put(header.getName(), list);
            }
            list.add(header.getValue());
        }
        return results;
    }

    /**
     * http的get请求
     * @param url
     */
    public static String get(String url) {
        return get(url, "UTF-8");
    }

    public static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);

    /**
     * http的get请求
     * @param url
     */
    public static String get(String url, String charset) {
        HttpGet httpGet = new HttpGet(url);
        return executeRequest(httpGet, charset);
    }

    /**
     * http的get请求,增加异步请求头参数
     * @param url
     */
    public static String ajaxGet(String url) {
        return ajaxGet(url, "UTF-8");
    }

    /**
     * http的get请求,增加异步请求头参数
     *
     * @param url
     */
    public static String ajaxGet(String url, String charset) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("X-Requested-With", "XMLHttpRequest");
        return executeRequest(httpGet, charset);
    }

    /**
     * @param url
     * @return
     */
    public static String ajaxGet(CloseableHttpClient httpclient, String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("X-Requested-With", "XMLHttpRequest");
        return executeRequest(httpclient, httpGet, "UTF-8");
    }

    /**
     * http的post请求,传递map格式参数
     */
    public static String post(String url, Map<String, String> dataMap) {
        return post(url, dataMap, "UTF-8");
    }

    /**
     * http的post请求,传递map格式参数
     */
    public static String post(String url, Map<String, String> dataMap, String charset) {
        HttpPost httpPost = new HttpPost(url);
        try {
            if (dataMap != null) {
                List<NameValuePair> nvps = new ArrayList<NameValuePair>();
                for (Map.Entry<String, String> entry : dataMap.entrySet()) {
                    nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
                UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);
                formEntity.setContentEncoding(charset);
                httpPost.setEntity(formEntity);
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return executeRequest(httpPost, charset);
    }

    /**
     * http的post请求,增加异步请求头参数,传递map格式参数
     */
    public static String ajaxPost(String url, Map<String, String> dataMap) {
        return ajaxPost(url, dataMap, "UTF-8");
    }

    /**
     * http的post请求,增加异步请求头参数,传递map格式参数
     */
    public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {
        HttpPost httpPost = new HttpPost(url);
        httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
        try {
            if (dataMap != null) {
                List<NameValuePair> nvps = new ArrayList<NameValuePair>();
                for (Map.Entry<String, String> entry : dataMap.entrySet()) {
                    nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
                UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);
                formEntity.setContentEncoding(charset);
                httpPost.setEntity(formEntity);
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return executeRequest(httpPost, charset);
    }

    /**
     * http的post请求,增加异步请求头参数,传递json格式参数
     */
    public static String ajaxPostJson(String url, String jsonString) {
        return ajaxPostJson(url, jsonString, "UTF-8");
    }

    /**
     * http的post请求,增加异步请求头参数,传递json格式参数
     */
    public static String ajaxPostJson(String url, String jsonString, String charset) {
        HttpPost httpPost = new HttpPost(url);
        httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
        
        StringEntity stringEntity = new StringEntity(jsonString, charset);// 解决中文乱码问题
        stringEntity.setContentEncoding(charset);
        stringEntity.setContentType("application/json");
        httpPost.setEntity(stringEntity);
        return executeRequest(httpPost, charset);
    }

    /**
     * 执行一个http请求,传递HttpGet或HttpPost参数
     */
    public static String executeRequest(HttpUriRequest httpRequest) {
        return executeRequest(httpRequest, "UTF-8");
    }

    /**
     * 执行一个http请求,传递HttpGet或HttpPost参数
     */
    public static String executeRequest(HttpUriRequest httpRequest, String charset) {
        CloseableHttpClient httpclient;
        if ("https".equals(httpRequest.getURI().getScheme())) {
            httpclient = createSSLInsecureClient();
        } else {
            httpclient = HttpClients.createDefault();
        }
        String result = "";
        try {
            try {
                CloseableHttpResponse response = httpclient.execute(httpRequest);
                HttpEntity entity = null;
                try {
                    entity = response.getEntity();
                    result = EntityUtils.toString(entity, charset);
                } finally {
                    EntityUtils.consume(entity);
                    response.close();
                }
            } finally {
                httpclient.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return result;
    }

    public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) {
        String result = "";
        try {
            try {
                CloseableHttpResponse response = httpclient.execute(httpRequest);
                HttpEntity entity = null;
                try {
                    entity = response.getEntity();
                    result = EntityUtils.toString(entity, charset);
                } finally {
                    EntityUtils.consume(entity);
                    response.close();
                }
            } finally {
                httpclient.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * 创建 SSL连接
     */
    public static CloseableHttpClient createSSLInsecureClient() {
        try {
            SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    return true;
                }
            }).build();
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() {
                @Override
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            });
            return HttpClients.custom().setSSLSocketFactory(sslsf).build();
        } catch (GeneralSecurityException ex) {
            throw new RuntimeException(ex);
        }
    }
}
Copy the code

run

Due to network and other issues, not every download succeeds on the first run, but the program can be run several times to achieve a higher download success rate. Overall, the result is quite impressive.

Source: blog.csdn.net/qq_35402412/article/details/113627625