写点什么

用 Java 爬美女图片,这个厉害了!

  • 2021 年 11 月 12 日
  • 本文字数:4666 字

    阅读完需:约 15 分钟

爬取搜狗图片上千张美女图片并下载到本地

准备工作

爬取地址:https://pic.sogou.com/pics?query=%E7%BE%8E%E5%A5%B3

分析

打开上面的地址,按 F12 打开开发者工具,依次选择 Network - XHR,然后将页面往下滑动,XHR 栏中会出现如下请求信息:


Request URL :https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=%E7%BE%8E%E5%A5%B3


分析这段请求 URL 的主要几个参数:


start=48 表示从第 48 张图片开始检索


xml_len=48 表示从第 48 张往后获取 48 张图片


query=?搜索关键词(例:美女,这里浏览器自动做了转码,不影响我们使用)



点击 Response,找个 JSON 格式化工具辅助查看。



JSON 格式:https://www.bejson.com/


分析 Response 返回的信息,可以发现我们想要的图片地址放在 picUrl 里。


思路

通过以上分析,不难实现下载方法,思路如下:


  1. 设置 URL 请求参数

  2. 访问 URL 请求,获取图片地址

  3. 图片地址存入 List

  4. 遍历 List,使用线程池下载到本地

代码

SougouImgProcessor.java 爬取图片类


import com.alibaba.fastjson.JSONObject;


import us.codecraft.webmagic.utils.HttpClientUtils;


import victor.chang.crawler.pipeline.SougouImgPipeline;


import java.util.ArrayList;


import java.util.List;


/**
 * Collects picture URLs from paged search-result responses and hands them to
 * {@link SougouImgPipeline} for download.
 *
 * <p>Typical use: call {@link #process(int, int)} once per result page, then call
 * {@link #pipelineData()} once to download everything that was collected.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SougouImgProcessor {

    /** Request URL template with three {@code %s} slots: start index, page size, query word. */
    private final String url;

    /** Downloader that receives the collected picture URLs. */
    private final SougouImgPipeline pipeline;

    /** Every result item seen so far, in arrival order. */
    private final List<JSONObject> dataList;

    /** The {@code picUrl} value of every result item that had one. */
    private final List<String> urlList;

    /** Query word; also passed to the pipeline as the download category. */
    private final String word;

    /**
     * @param url  request URL template, filled in with start index, page size and query word
     * @param word query word to search for
     */
    public SougouImgProcessor(String url, String word) {
        this.url = url;
        this.word = word;
        this.pipeline = new SougouImgPipeline();
        this.dataList = new ArrayList<>();
        this.urlList = new ArrayList<>();
    }

    /**
     * Requests one page of results and records the items and picture URLs it contains.
     *
     * <p>A response that is empty, or that lacks the {@code data.items} array, contributes
     * nothing instead of failing with a {@link NullPointerException}.
     *
     * @param idx  index of the first result to request
     * @param size number of results to request
     */
    public void process(int idx, int size) {
        String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word));
        if (res == null || res.isEmpty()) {
            return;
        }

        JSONObject object = JSONObject.parseObject(res);
        JSONObject data = object == null ? null : object.getJSONObject("data");
        // JSONArray is a List<Object>; each element is checked before it is treated as an object.
        List<Object> items = data == null ? null : data.getJSONArray("items");
        if (items == null) {
            return;
        }

        for (Object element : items) {
            if (!(element instanceof JSONObject)) {
                continue;
            }
            JSONObject item = (JSONObject) element;
            this.dataList.add(item);

            String picUrl = item.getString("picUrl");
            if (picUrl != null && !picUrl.isEmpty()) {
                this.urlList.add(picUrl);
            }
        }
    }

    /** Downloads every collected picture URL through the pipeline's thread-pool variant. */
    public void pipelineData() {
        pipeline.processSync(this.urlList, this.word);
    }

    public static void main(String[] args) {
        String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
        SougouImgProcessor processor = new SougouImgProcessor(url, "美女");

        // Start index, results per request, and total number of results to request.
        int start = 0, size = 50, limit = 1000;

        for (int i = start; i < start + limit; i += size) {
            processor.process(i, size);
        }

        processor.pipelineData();
    }
}


SougouImgPipeline.java 图片下载类


import java.io.File;


import java.io.FileOutputStream;


import java.io.InputStream;


import java.net.URL;


import java.net.URLConnection;


import java.util.List;


import java.util.Objects;


import java.util.concurrent.ExecutorService;


import java.util.concurrent.Executors;


import java.util.concurrent.TimeUnit;


import java.util.concurrent.atomic.AtomicInteger;


/**
 * Saves pictures referenced by URL into a local directory tree.
 *
 * <p>Each picture is written to {@code <path>/<category>/<name><extension>}. The success
 * and failure counters are shared by all calls on one instance and are atomic, so the
 * thread-pool variants can update them concurrently.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Root directory used by the no-argument constructor. */
    private static final String DEFAULT_PATH = "E:/pipeline/sougou";

    /** Extension used when a picture URL does not carry a usable one. */
    private static final String DEFAULT_EXTENSION = ".jpg";

    /** Connect timeout and read timeout for one picture, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Longest extension (without the dot) that is accepted from a URL. */
    private static final int MAX_EXTENSION_LENGTH = 5;

    /** How long {@link #processSync(List, String)} waits for its pool to drain, in seconds. */
    private static final long AWAIT_SECONDS = 60;

    /** Fallback file extension, including the leading dot. */
    private final String extension;

    /** Root directory under which one sub-directory per category is created. */
    private String path;

    /** Number of pictures downloaded successfully. */
    private final AtomicInteger suc = new AtomicInteger();

    /** Number of pictures whose download failed. */
    private final AtomicInteger fails = new AtomicInteger();

    public SougouImgPipeline() {
        this(DEFAULT_PATH, DEFAULT_EXTENSION);
    }

    public SougouImgPipeline(String path) {
        this(path, DEFAULT_EXTENSION);
    }

    /**
     * @param path      root directory for downloads
     * @param extension fallback extension (with leading dot) for URLs that have none;
     *                  {@code null} selects {@code ".jpg"}
     */
    public SougouImgPipeline(String path, String extension) {
        this.path = path;
        this.extension = extension == null ? DEFAULT_EXTENSION : extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one picture unless a file with the target name already exists.
     *
     * @param url  address of the picture
     * @param cate category; used as the sub-directory name
     * @param name file name without extension
     * @throws Exception if the directory cannot be created, the URL is malformed,
     *                   or reading or writing fails
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        File dir = new File(this.path + "/" + cate + "/");
        // The second isDirectory() covers another thread creating the directory concurrently.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new java.io.IOException("cannot create directory " + dir);
        }

        URL source = new URL(url);
        String fileName = (name + extensionOf(source)).replace("-", "");
        File img = new File(dir, fileName);
        if (img.exists()) { // downloaded before: skip
            System.out.println(String.format("文件 %s 已存在本地目录", fileName));
            return;
        }

        URLConnection con = source.openConnection();
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);

        boolean complete = false;
        try (InputStream in = con.getInputStream();
             FileOutputStream os = new FileOutputStream(img)) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                os.write(buffer, 0, len);
            }
            complete = true;
        } finally {
            // A truncated file would otherwise be mistaken for a finished download next time.
            if (!complete) {
                img.delete();
            }
        }

        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下载第 %s 张图片", suc.incrementAndGet()));
    }

    /**
     * Returns the extension (with leading dot) of the last path segment of {@code source},
     * or the fallback extension when that segment has no short alphanumeric suffix.
     * Query string and fragment are ignored because only the URL path is inspected.
     */
    private String extensionOf(URL source) {
        String urlPath = source.getPath();
        String lastSegment = urlPath.substring(urlPath.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        if (dot >= 0) {
            String candidate = lastSegment.substring(dot + 1);
            if (!candidate.isEmpty()
                    && candidate.length() <= MAX_EXTENSION_LENGTH
                    && isAsciiAlphanumeric(candidate)) {
                return "." + candidate;
            }
        }
        return extension;
    }

    /** Tells whether {@code text} consists only of ASCII letters and digits. */
    private static boolean isAsciiAlphanumeric(String text) {
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            boolean ok = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            if (!ok) {
                return false;
            }
        }
        return true;
    }

    /** Counts a failed download and reports it on standard error. */
    private void recordFailure(String picUrl, Exception cause) {
        fails.incrementAndGet();
        System.err.println("下载失败: " + picUrl + " (" + cause + ")");
    }

    /**
     * Downloads the pictures one after another on the calling thread.
     *
     * <p>File names are derived from the hash of each URL, so they are valid file names
     * and do not depend on the position in {@code data}.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category; used as the sub-directory name
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();

        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                downloadImg(picUrl, word, String.format("%08x", picUrl.hashCode()));
            } catch (Exception e) {
                recordFailure(picUrl, e);
            }
        }

        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
    }

    /**
     * Downloads the pictures on a cached thread pool, one task per URL, then prints a summary.
     *
     * <p>Files are named after the zero-padded position of the URL in {@code data}
     * ({@code 0000}, {@code 0001}, ...). The method waits up to 60 seconds for the pool
     * to finish before printing.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category; used as the sub-directory name
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();

        for (int i = 0; i < data.size(); i++) {
            final String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            // Padding keeps names unique for every index, including 1000 and above.
            final String name = String.format("%04d", i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, name);
                } catch (Exception e) {
                    recordFailure(picUrl, e);
                }
            });
        }

        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(AWAIT_SECONDS, TimeUnit.SECONDS)) {
                // Running downloads are left to finish (no shutdownNow), so the figures
                // printed below are only a snapshot in that case.
                System.out.println("等待超时,仍有下载任务未完成");
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // keep the interrupt visible to the caller
            return;
        }

        System.out.println("AwaitTermination Finished");
        System.out.println("共有 URL: " + data.size());
        System.out.println("下载成功: " + suc);
        System.out.println("下载失败: " + fails);

        // list() returns null when the directory was never created (nothing to download).
        String[] saved = new File(this.path + "/" + word + "/").list();
        System.out.println("当前共有文件: " + (saved == null ? 0 : saved.length));

        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
    }

    /**
     * Splits {@code data} into {@code threadNum} consecutive segments and runs
     * {@link #process(List, String)} for each segment on a thread pool.
     *
     * <p>When {@code threadNum} is not positive, or there are fewer URLs than threads,
     * the whole list is processed on the calling thread instead. In the pooled case this
     * method returns after submitting the segments; it does not wait for them to finish.
     *
     * @param data      picture URLs
     * @param word      category; used as the sub-directory name
     * @param threadNum number of segments to process in parallel
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }

        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // URLs per segment
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // The last segment also takes the remainder of the division.
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }
}


HttpClientUtils.java http 请求工具类


import org.apache.http.Header;


import org.apache.http.HttpEntity;


import org.apache.http.NameValuePair;


import org.apache.http.client.entity.UrlEncodedFormEntity;


import org.apache.http.client.methods.CloseableHttpResponse;


import org.apache.http.client.methods.HttpGet;


import org.apache.http.client.methods.HttpPost;


import org.apache.http.client.methods.HttpUriRequest;


import org.apache.http.conn.ssl.SSLConnectionSocketFactory;


import org.apache.http.conn.ssl.TrustStrategy;


import org.apache.http.entity.StringEntity;


import org.apache.http.impl.client.CloseableHttpClient;


import org.apache.http.impl.client.HttpClients;


import org.apache.http.message.BasicNameValuePair;


import org.apache.http.ssl.SSLContextBuilder;


import org.apache.http.util.EntityUtils;


import org.slf4j.Logger;


import org.slf4j.LoggerFactory;


import javax.net.ssl.HostnameVerifier;


import javax.net.ssl.SSLContext;


import javax.net.ssl.SSLSession;


import java.io.IOException;


import java.io.UnsupportedEncodingException;


import java.security.GeneralSecurityException;


import java.security.cert.CertificateException;


import java.security.cert.X509Certificate;


import java.util.ArrayList;


import java.util.HashMap;


import java.util.List;


import java.util.Map;


/**
 * @author code4crafter@gmail.com
 * Date: 17/3/27
 */


public abstract class HttpClientUtils {


/**
 * Groups header values by header name.
 *
 * <p>Values of headers that share a name are collected into one list, in the order
 * they appear in {@code headers}.
 *
 * @param headers headers to group; {@code null} is treated as no headers
 * @return a new map from header name to the values seen for that name
 */
public static Map<String, List<String>> convertHeaders(Header[] headers) {
    Map<String, List<String>> results = new HashMap<String, List<String>>();
    if (headers == null) {
        return results;
    }
    for (Header header : headers) {
        String name = header.getName();
        List<String> values = results.get(name);
        if (values == null) {
            values = new ArrayList<String>();
            results.put(name, values);
        }
        values.add(header.getValue());
    }
    return results;
}


/**
 * Issues an HTTP GET request, passing {@code "UTF-8"} as the charset.
 *
 * @param url address to request
 * @return the result of {@link #get(String, String)} for this URL
 */
public static String get(String url) {
    final String charset = "UTF-8";
    return get(url, charset);
}


/** Logger for this utility class; final so that other code cannot replace it. */
public static final Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);


/**
 * Issues an HTTP GET request with the given charset.
 *
 * @param url     address to request
 * @param charset charset name handed to {@code executeRequest} together with the request
 * @return whatever {@code executeRequest} returns for the request
 */
public static String get(String url, String charset) {
    return executeRequest(new HttpGet(url), charset);
}


/**
 * Issues an HTTP GET request marked as an XMLHttpRequest, passing {@code "UTF-8"}
 * as the charset.
 *
 * @param url address to request
 * @return the result of {@link #ajaxGet(String, String)} for this URL
 */
public static String ajaxGet(String url) {
    final String charset = "UTF-8";
    return ajaxGet(url, charset);
}


/**
 * Issues an HTTP GET request that carries the {@code X-Requested-With: XMLHttpRequest}
 * header.
 *
 * @param url     address to request
 * @param charset charset name handed to {@code executeRequest} together with the request
 * @return whatever {@code executeRequest} returns for the request
 */
public static String ajaxGet(String url, String charset) {
    HttpGet request = new HttpGet(url);
    // Header that marks the request as asynchronous (ajax).
    request.setHeader("X-Requested-With", "XMLHttpRequest");
    String body = executeRequest(request, charset);
    return body;
}


/**
 * Issues an HTTP GET request that carries the {@code X-Requested-With: XMLHttpRequest}
 * header through the supplied client, passing {@code "UTF-8"} as the charset.
 *
 * @param httpclient client handed to {@code executeRequest} together with the request
 * @param url        address to request
 * @return whatever {@code executeRequest} returns for the request
 */
public static String ajaxGet(CloseableHttpClient httpclient, String url) {
    HttpGet request = new HttpGet(url);
    // Header that marks the request as asynchronous (ajax).
    request.setHeader("X-Requested-With", "XMLHttpRequest");
    String body = executeRequest(httpclient, request, "UTF-8");
    return body;
}

评论

发布
暂无评论
用 Java 爬美女图片,这个厉害了!