用两个正则表达式:
1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>
2、匹配img标签中的src中http路径的正则:http:\"?(.*?)(\"|>|\\s+)
实现:
package org.swinglife.main; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /*** * java抓取网络图片 * @author swinglife * */ public class CatchImage { // 地址 private static final String URL = "http://www.zzvips.com" ; // 编码 private static final String ECODING = "UTF-8" ; // 获取img标签正则 private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>" ; // 获取src路径的正则 private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)" ; public static void main(String[] args) throws Exception { CatchImage cm = new CatchImage(); //获得html文本内容 String HTML = cm.getHTML(URL); //获取图片标签 List<String> imgUrl = cm.getImageUrl(HTML); //获取图片src地址 List<String> imgSrc = cm.getImageSrc(imgUrl); //下载图片 cm.Download(imgSrc); } /*** * 获取HTML内容 * * @param url * @return * @throws Exception */ private String getHTML(String url) throws Exception { URL uri = new URL(url); URLConnection connection = uri.openConnection(); InputStream in = connection.getInputStream(); byte [] buf = new byte [ 1024 ]; int length = 0 ; StringBuffer sb = new StringBuffer(); while ((length = in.read(buf, 0 , buf.length)) > 0 ) { sb.append( new String(buf, ECODING)); } in.close(); return sb.toString(); } /*** * 获取ImageUrl地址 * * @param HTML * @return */ private List<String> getImageUrl(String HTML) { Matcher matcher = Pattern.compile(IMGURL_REG).matcher(HTML); List<String> listImgUrl = new ArrayList<String>(); while (matcher.find()) { listImgUrl.add(matcher.group()); } return listImgUrl; } /*** * 获取ImageSrc地址 * * @param listImageUrl * @return */ private List<String> getImageSrc(List<String> listImageUrl) { List<String> listImgSrc = new ArrayList<String>(); for (String image : listImageUrl) { Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image); while (matcher.find()) { listImgSrc.add(matcher.group().substring( 0 , 
matcher.group().length() - 1 )); } } return listImgSrc; } /*** * 下载图片 * * @param listImgSrc */ private void Download(List<String> listImgSrc) { try { for (String url : listImgSrc) { String imageName = url.substring(url.lastIndexOf( "/" ) + 1 , url.length()); URL uri = new URL(url); InputStream in = uri.openStream(); FileOutputStream fo = new FileOutputStream( new File(imageName)); byte [] buf = new byte [ 1024 ]; int length = 0 ; System.out.println( "开始下载:" + url); while ((length = in.read(buf, 0 , buf.length)) != - 1 ) { fo.write(buf, 0 , length); } in.close(); fo.close(); System.out.println(imageName + "下载完成" ); } } catch (Exception e) { System.out.println( "下载失败" ); } } } |
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:http://blog.csdn.net/swingpyzf/article/details/16338903