本文共 3472 字,大约阅读时间需要 11 分钟。
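<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>SpiderJava</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.10</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.12</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>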
package spider.pojo.po;

import lombok.Data;
import spider.common.annotation.CssSelector;
import spider.common.annotation.Html;

import java.util.List;

/**
 * Page model whose fields are bound to CSS selectors through {@link CssSelector}.
 *
 * <p>Lombok's {@code @Data} generates the accessors (for example {@code getTitle()}).
 * NOTE(review): {@code @Html} is declared elsewhere in the project; presumably it marks
 * this class as an HTML-extraction model -- confirm against the annotation's definition.
 *
 * <p>Created: 2020/6/23 19:42
 *
 * @author lyr
 */
@Html
@Data
public class SimpleBlogHtml {

    /**
     * Texts extracted for the {@code .oneline span} selector. The field is assigned
     * reflectively, so its type must stay compatible with a {@code List<String>}.
     */
    @CssSelector(selector = ".oneline span")
    private List<String> title;
}
package spider.common.bean;

import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import spider.common.annotation.CssSelector;
import spider.pojo.po.SimpleBlogHtml;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.lang.reflect.Field;
import java.util.List;

/**
 * Page processor that fills a {@link SimpleBlogHtml} from the downloaded page.
 *
 * <p>Every declared field of {@link SimpleBlogHtml} carrying a {@link CssSelector}
 * annotation is populated with the text of the {@code span} elements found under that
 * selector. The populated object is stored in the page's result items under the key
 * {@code "html"}.
 *
 * <p>Created: 2020/6/23 18:50
 *
 * @author lyr
 */
@Slf4j
public class HtmlProcessor implements PageProcessor {

    /** Crawl settings handed to the spider; the charset is fixed to UTF-8. */
    private final Site site = new Site().setCharset("UTF-8");

    /**
     * Extracts the annotated fields of {@link SimpleBlogHtml} from {@code page}.
     *
     * <p>{@code @SneakyThrows} rethrows the checked {@link IllegalAccessException}
     * that {@link Field#set(Object, Object)} may raise without declaring it.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    @SneakyThrows
    public void process(Page page) {
        page.setCharset("UTF-8");
        SimpleBlogHtml blogHtml = new SimpleBlogHtml();

        for (Field field : SimpleBlogHtml.class.getDeclaredFields()) {
            CssSelector cssSelector = field.getAnnotation(CssSelector.class);
            log.debug("field {} -> {}", field.getName(), cssSelector);
            if (cssSelector == null) {
                // Fields without a selector are not extraction targets; leave them untouched.
                continue;
            }

            String selector = cssSelector.selector();
            List<String> result = page.getHtml()
                    .css(selector)
                    .xpath("//span/text()")
                    .all();

            // The model's fields are private, so open them up before the reflective write.
            field.setAccessible(true);
            field.set(blogHtml, result);
        }

        log.info("title {}", blogHtml.getTitle());
        page.putField("html", blogHtml);
    }

    /**
     * Returns the crawl settings used for this processor.
     *
     * @return the shared {@link Site} instance configured with the UTF-8 charset
     */
    @Override
    public Site getSite() {
        return site;
    }
}
package spider.common.bean;import lombok.extern.slf4j.Slf4j;import us.codecraft.webmagic.ResultItems;import us.codecraft.webmagic.Task;import us.codecraft.webmagic.pipeline.Pipeline;/** * @Author lyr * @create 2020/6/23 19:53 */@Slf4jpublic class DownloadPipeLineimplements Pipeline { public void process(ResultItems resultItems, Task task) { T data = resultItems.get("html"); log.info("data {}",data); }}
package spider;

import spider.common.bean.DownloadPipeLine;
import spider.common.bean.HtmlProcessor;
import spider.pojo.po.SimpleBlogHtml;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;

/**
 * Entry point: crawls a single blog page with {@link HtmlProcessor} and passes the
 * extracted result to {@link DownloadPipeLine}.
 *
 * <p>Created: 2020/6/23 18:52
 *
 * @author lyr
 */
public class Main {

    /**
     * Builds the spider and runs it on the calling thread.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        Spider.create(new HtmlProcessor())
                .addUrl("https://blog.csdn.net/qq_43923045")
                // Explicit type argument instead of a raw type: the pipeline reads a SimpleBlogHtml.
                .addPipeline(new DownloadPipeLine<SimpleBlogHtml>())
                .run();
    }
}