博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
webmagic文本处理(爬虫项目)
阅读量:3952 次
发布时间:2019-05-24

本文共 3472 字,大约阅读时间需要 11 分钟。

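<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>SpiderJava</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.10</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.12</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>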
package spider.pojo.po;

import lombok.Data;
import spider.common.annotation.CssSelector;
import spider.common.annotation.Html;

import java.util.List;

/**
 * Holder for the values scraped from a blog listing page.
 *
 * <p>Fields annotated with {@link CssSelector} are populated reflectively by
 * {@code HtmlProcessor}, which reads the annotation's selector and stores the
 * matched text values in the field.
 *
 * <p>Created 2020/6/23 19:42.
 *
 * @author lyr
 */
@Html
@Data
public class SimpleBlogHtml {

    /** Text values extracted with the {@code .oneline span} selector. */
    @CssSelector(selector = ".oneline span")
    private List<String> title;
}
package spider.common.bean;

import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import spider.common.annotation.CssSelector;
import spider.pojo.po.SimpleBlogHtml;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.lang.reflect.Field;
import java.util.List;

/**
 * Page processor that fills a {@link SimpleBlogHtml} from a downloaded page.
 *
 * <p>For every field of {@link SimpleBlogHtml} carrying a {@link CssSelector}
 * annotation, the annotation's selector is applied to the page HTML and the
 * resulting text values are written into that field via reflection. The
 * populated object is published to the pipelines under the key {@code "html"}.
 *
 * <p>Created 2020/6/23 18:50.
 *
 * @author lyr
 */
@Slf4j
public class HtmlProcessor implements PageProcessor {

    /** Crawl configuration; the charset is fixed to UTF-8. */
    private final Site site = new Site().setCharset("UTF-8");

    /**
     * Extracts the annotated fields of {@link SimpleBlogHtml} from {@code page}
     * and stores the result in the page's result items under {@code "html"}.
     *
     * <p>{@code @SneakyThrows} lets the reflective {@link Field#set} call
     * propagate {@link IllegalAccessException} without a throws clause, which
     * the {@link PageProcessor} contract does not allow.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    @SneakyThrows
    public void process(Page page) {
        page.setCharset("UTF-8");

        SimpleBlogHtml blogHtml = new SimpleBlogHtml();
        for (Field field : SimpleBlogHtml.class.getDeclaredFields()) {
            CssSelector cssSelector = field.getAnnotation(CssSelector.class);
            log.debug("field {} selector {}", field.getName(), cssSelector);
            if (cssSelector == null) {
                // Only annotated fields are populated from the page.
                continue;
            }

            // The target fields are private, so access checks must be lifted first.
            field.setAccessible(true);

            // Narrow with the CSS selector, then keep only the span text nodes.
            List<String> result = page.getHtml()
                    .css(cssSelector.selector())
                    .xpath("//span/text()")
                    .all();
            field.set(blogHtml, result);
        }

        log.info("title {}", blogHtml.getTitle());
        page.putField("html", blogHtml);
    }

    /**
     * Returns the site configuration used for this crawl.
     *
     * @return the shared {@link Site} instance
     */
    @Override
    public Site getSite() {
        return site;
    }
}
package spider.common.bean;import lombok.extern.slf4j.Slf4j;import us.codecraft.webmagic.ResultItems;import us.codecraft.webmagic.Task;import us.codecraft.webmagic.pipeline.Pipeline;/** * @Author lyr * @create 2020/6/23 19:53 */@Slf4jpublic class DownloadPipeLine
implements Pipeline {
public void process(ResultItems resultItems, Task task) {
T data = resultItems.get("html"); log.info("data {}",data); }}
package spider;

import spider.common.bean.DownloadPipeLine;
import spider.common.bean.HtmlProcessor;
import spider.pojo.po.SimpleBlogHtml;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;

/**
 * Entry point of the crawler demo.
 *
 * <p>Created 2020/6/23 18:52.
 *
 * @author lyr
 */
public class Main {

    /**
     * Starts a spider on the blog's listing page, processing it with
     * {@link HtmlProcessor} and passing the result to a {@link DownloadPipeLine}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider.create(new HtmlProcessor())
                .addUrl("https://blog.csdn.net/qq_43923045")
                .addPipeline(new DownloadPipeLine<SimpleBlogHtml>())
                .run();
    }
}
你可能感兴趣的文章
多线程
查看>>
【Linux】Centos7 常用命令
查看>>
【Redis】Centos7下安装Redis
查看>>
【Redis】Centos7下搭建Redis集群
查看>>
【Redis】Centos7下搭建Redis集群——哨兵模式
查看>>
【Linux】本地ping不同VM虚拟机
查看>>
【SpringCloud】Hystrix
查看>>
快速阅读——《认知篇》
查看>>
【Asp.net】基本概念
查看>>
【Asp.net】Web服务器控件
查看>>
【Asp.net】内置对象
查看>>
C语言数据类型笔记 by STP
查看>>
C语言指针笔记 by STP
查看>>
CoreLocation笔记 by STP
查看>>
Application Transport Security has blocked a cleartext HTTP (http://) 解决方案
查看>>
The identity used to sign the executable is no longer valid.解决方案
查看>>
Xcode增加pch文件
查看>>
CocoaPods安装和使用笔记 by STP
查看>>
Could not find developer disk image-解决方案
查看>>
升级Xcode之后VVDocumenter-Xcode不能用的解决办法
查看>>