Java应用开源框架实现简易web搜索引擎

引言

应用 Java 的开源库编写一个搜索引擎：该引擎能爬取一个网站的内容，并根据网页内容进行深度爬取，获取所有相关的网页地址和内容；用户可以通过关键词搜索所有相关的网址。

具体功能

(1) 用户可以指定爬取一个url对应的网页的内容。
(2) 对网页内容进行解析，并获取其中所有的url链接地址。
(3) 用户可以设定爬取深度，代表着从初始url对应的页面开始，可以爬取其中所有的url对应的网页内的url，以此类推。深度越大，能爬取到的网站越多。
(4) 对爬取到的url内容进行保存、建立索引。建立索引的内容是url地址本身，和url对应的网页标题。
(5) 用户可以通过关键词对网址进行搜索，找出有该关键词的url地址。
(6) 建立索引和搜索索引的过程能智能识别中文关键词，能对关键词进行分词操作。
(7) 用户可以指定保存索引的地址、初始url、爬取深度、进行搜索的关键词和最大匹配项。

开源框架

Lucene
Jsoup

源码

爬虫部分：Spider.java

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

									package webCrawler.Spider;

									import java.io.IOException;

									import java.util.ArrayList;

									import java.util.HashSet;

									import java.util.Scanner;

									import org.jsoup.Jsoup;

									import org.jsoup.nodes.Document;

									import org.jsoup.nodes.Element;

									import org.jsoup.select.Elements;

									import webCrawler.Index.BuildIndex;

									/**

									 * @author lannooo

									 */

									public class Spider {

									  ArrayList<String> URLs;

									  private String startURL;

									  private int digLevel;

									  /**

									   * @param startURL 爬虫的起始URL

									   * @param digLevel 爬取深度

									   */

									  public Spider(String startURL, int digLevel){

									    this.startURL = startURL;

									    this.digLevel = digLevel;

									    this.URLs = new ArrayList<>();

									  }

									  /**

									   * @param level 当前爬取的深度剩余

									   * @param arrayList 需要进行下一轮爬去的URL集

									   * @return 从一格url集爬取到的新的URL集

									   * @throws IOException

									   */

									  public ArrayList<String> getLevelURLs(int level, ArrayList<String> arrayList) 

									      throws IOException{

									    ArrayList<String> total = null;

									    if(level>0){      

									      total = new ArrayList<>();

									      for(String url: arrayList){

									        /*对于每个arrayList中的URL，首先解析其网页内容，并获得里面所有URL项*/

									        for(String each: getBareLinks(url)){

									          total.add(each);

									        }

									      }

									      /*用HashSet这个容器将total里面重复项删除*/

									      HashSet<String> hashSet = new HashSet<>(total);

									      total = new ArrayList<>(hashSet);

									    }

									    return total;

									  }

									  /**

									   * 从startURL开始，爬取所有相关URLs

									   * @throws IOException

									   */

									  public void getAll() throws IOException{

									    ArrayList<String> newURLs;

									    ArrayList<String> currentURLs = new ArrayList<>();

									    /*把startURL加入currentURLs这个列表中，从这个url开始爬*/

									    currentURLs.add(startURL);

									    for(int i=digLevel; i>0; i--){

									      /*

									       * 对于每一层，都要获取一次由这个url引申出去的url集

									       * 然后把当前集的已经爬去过的url加入到总的URL集中

									       * 最后newURLs作为新的需要进行深度爬取的集进入下一轮循环

									       */

									      System.out.println("Dig into level: " + (digLevel-i+1));

									      newURLs = getLevelURLs(i, currentURLs);

									      for(String each: currentURLs){

									        URLs.add(each);

									      }

									      currentURLs = newURLs;

									    }

									    for(String each:currentURLs){

									      URLs.add(each);

									    }

									    HashSet<String> hashSet = new HashSet<>(URLs);

									    URLs = new ArrayList<>(hashSet);

									  }

									  /**

									   * @param path 保存索引的路径

									   * @throws IOException

									   */

									  public void storeURLsAndInfo(String path) throws IOException{

									    BuildIndex build = new BuildIndex(path);

									    /* 把URLs中的所有url进行实际网页标题的爬取*/

									    for(String each:URLs){

									      String text = getLinkText(each);

									      if(text!=null){

									        build.addField("url", each);

									        build.addField("text", text);

									        /*将这一个entry加入索引中*/

									        build.pushIndex();

									      }

									    }

									    build.close();

									  }

									  /**

									   * @param url 需要获取网页标题的url

									   * @return 标题内容

									   * @throws IOException

									   */

									  public String getLinkText(String url) throws IOException{

									    Document document = null;

									    try {

									      /*用Jsoup进行连接，设置超时时间为3秒*/

									      document = Jsoup.connect(url).timeout(3000).get();

									    } catch (Exception e) {

									      System.out.println("[TIMEOUT]Get title of url:"+url);

									      return null;

									    }

									    String id="codetool">



	建立索引：BuildIndex.java

	
		
			
				?
			
				
					
						
							
								1
							
								2
							
								3
							
								4
							
								5
							
								6
							
								7
							
								8
							
								9
							
								10
							
								11
							
								12
							
								13
							
								14
							
								15
							
								16
							
								17
							
								18
							
								19
							
								20
							
								21
							
								22
							
								23
							
								24
							
								25
							
								26
							
								27
							
								28
							
								29
							
								30
							
								31
							
								32
							
								33
							
								34
							
								35
							
								36
							
								37
							
								38
							
								39
							
								40
							
								41
							
								42
							
								43
							
								44
							
								45
							
								46
							
								47
							
								48
							
								49
							
								50
							
								51
							
								52
							
								53
							
								54
							
								55
							
								56
							
								57
							
								58
							
								59
							
								60
							
								61
							
								62
							
								63
							
								64
							
								65
							
								66
							
								67
							
								68
							
								69
							
								70
							
								71
							
								72
							
								73
							
								74
							
								75
							
								76
							
								77
							
								78
							
								79
							
								80
							
								81
							
								82
							
								83
							
								84
							
								85
							
								86
							
								87
							
								88
							
								89
							
								90
							
								91
							
								92
						
						
							
								
									package webCrawler.Index;
								
									 
								
									import java.io.*;
								
									 
								
									import org.apache.lucene.analysis.Analyzer;
								
									import org.apache.lucene.document.Document;
								
									import org.apache.lucene.document.Field;
								
									import org.apache.lucene.document.TextField;
								
									import org.apache.lucene.index.IndexWriter;
								
									import org.apache.lucene.index.IndexWriterConfig;
								
									import org.apache.lucene.store.Directory;
								
									import org.apache.lucene.store.FSDirectory;
								
									import org.apache.lucene.util.Version;
								
									import org.wltea.analyzer.lucene.IKAnalyzer;
								
									 
								
									/**
								
									 * @author lannooo
								
									 *
								
									 */
								
									public class BuildIndex {
								
									  private File file;
								
									  private Directory directory;
								
									  private IndexWriter indexWriter;
								
									  private IndexWriterConfig config;
								
									  private Analyzer analyzer;
								
									  private Document document;
								
									 
								
									  /**
								
									   * @param path 建立索引的路径
								
									   */
								
									  public BuildIndex(String path) {
								
									    try {
								
									      file = new File(path);
								
									      directory = FSDirectory.open(file);
								
									      document = new Document();
								
									      analyzer = new IKAnalyzer();    /*中文分词工具类*/
								
									      config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);
								
									      indexWriter = new IndexWriter(directory, config);      
								
									 
								
									    } catch (Exception e) {
								
									      e.printStackTrace();
								
									    }
								
									  }
								
									 
								
									  /**
								
									   * @param fieldName 加入到document中的新的一项的名称
								
									   * @param fieldText 新的一项的内容
								
									   */
								
									  public void addField(String fieldName, String fieldText){
								
									    try{
								
									      Field field = new TextField(fieldName, fieldText, Field.Store.YES);
								
									      document.add(field);
								
									    }catch (Exception e) {
								
									      e.printStackTrace();
								
									    }
								
									  }
								
									 
								
									  /**
								
									   * 将document加入到索引中
								
									   */
								
									  public void pushIndex(){
								
									    try {
								
									      indexWriter.addDocument(document);
								
									      document = new Document();
								
									    } catch (Exception e) {
								
									      e.printStackTrace();
								
									    }
								
									  }
								
									 
								
									  /**
								
									   * 加入完整的一个document并保存到索引中
								
									   * @param url 加入的url地址
								
									   * @param text url对应的文本
								
									   */
								
									  public void addOneIndex(String url, String text){
								
									    this.addField("url", url);
								
									    this.addField("text", text);
								
									    this.pushIndex();
								
									  }
								
									 
								
									  /**
								
									   * 关闭索引写入
								
									   */
								
									  public void close(){
								
									    try {
								
									      indexWriter.close();
								
									    } catch (Exception e) {
								
									      e.printStackTrace();
								
									    }
								
									  }
								
									 
								
									}
							
						
					
				
			
		
	
	
		
			
	


	搜索索引：SearchIndex.java

	
		
			
				?
			
				
					
						
							
								1
							
								2
							
								3
							
								4
							
								5
							
								6
							
								7
							
								8
							
								9
							
								10
							
								11
							
								12
							
								13
							
								14
							
								15
							
								16
							
								17
							
								18
							
								19
							
								20
							
								21
							
								22
							
								23
							
								24
							
								25
							
								26
							
								27
							
								28
							
								29
							
								30
							
								31
							
								32
							
								33
							
								34
							
								35
							
								36
							
								37
							
								38
							
								39
							
								40
							
								41
							
								42
							
								43
							
								44
							
								45
							
								46
							
								47
							
								48
							
								49
							
								50
							
								51
							
								52
							
								53
							
								54
							
								55
							
								56
							
								57
							
								58
							
								59
							
								60
							
								61
							
								62
							
								63
							
								64
							
								65
							
								66
							
								67
							
								68
							
								69
							
								70
							
								71
							
								72
							
								73
							
								74
							
								75
							
								76
							
								77
							
								78
							
								79
							
								80
							
								81
							
								82
							
								83
							
								84
							
								85
							
								86
							
								87
							
								88
							
								89
							
								90
							
								91
							
								92
							
								93
							
								94
							
								95
							
								96
							
								97
							
								98
							
								99
							
								100
							
								101
							
								102
							
								103
							
								104
							
								105
							
								106
							
								107
							
								108
						
						
							
								
									package webCrawler.Index;
								
									 
								
									import java.io.File;
								
									import java.util.Scanner;
								
									 
								
									import org.apache.lucene.analysis.Analyzer;
								
									import org.apache.lucene.document.Document;
								
									import org.apache.lucene.index.DirectoryReader;
								
									import org.apache.lucene.queryparser.classic.QueryParser;
								
									import org.apache.lucene.search.IndexSearcher;
								
									import org.apache.lucene.search.Query;
								
									import org.apache.lucene.search.ScoreDoc;
								
									import org.apache.lucene.search.TopDocs;
								
									import org.apache.lucene.store.FSDirectory;
								
									import org.wltea.analyzer.lucene.IKAnalyzer;
								
									 
								
									/**
								
									 * @author lannooo
								
									 *
								
									 */
								
									public class SearchIndex {
								
									  private IndexSearcher indexSearcher;
								
									  private Analyzer analyzer;
								
									  private QueryParser parser;
								
									  private Query query;
								
									  private TopDocs hits;
								
									  private DirectoryReader reader;
								
									 
								
									  /**
								
									   * @param path 进行索引搜索的路径
								
									   */
								
									  public SearchIndex(String path){
								
									    try {
								
									      reader = DirectoryReader.open(FSDirectory.open(new File(path)));
								
									      indexSearcher = new IndexSearcher(reader);
								
									      analyzer = new IKAnalyzer();
								
									    } catch (Exception e) {
								
									      e.printStackTrace();
								
									    }
								
									  }
								
									 
								
									  /**
								
									   * @param fieldName 搜索的域名称
								
									   * @param text 搜索的内容
								
									   * @param matchNumber 最大匹配项数
								
									   * @return 搜索到的最大匹配数
								
									   */
								
									  public int search(String fieldName, String text, int matchNumber){
								
									    try {
								
									      parser = new QueryParser(fieldName, analyzer);
								
									      query = parser.parse(text);
								
									      hits = indexSearcher.search(query, matchNumber);
								
									 
								
									      return hits.totalHits;
								
									    } catch (Exception e) {
								
									      e.printStackTrace();
								
									    }
								
									    return -1;
								
									  }
								
									  /**
								
									   * 打印所有的匹配项
								
									   */
								
									  public void printHits(){
								
									    try{
								
									      System.out.println("Total hits number:"+hits.totalHits);
								
									      for(ScoreDoc doc: hits.scoreDocs){
								
									        Document document = indexSearcher.doc(doc.doc);
								
									        System.out.println(document.get("url"));
								
									        System.out.println(document.get("text"));
								
									      }
								
									      reader.close();
								
									    }catch (Exception e) {
								
									      e.printStackTrace();
								
									    }
								
									  }
								
									  public static void main(String[] args) {
								
									    /*输入关键词*/
								
									    Scanner in = new Scanner(System.in);
								
									    System.out.println("Enter path of the index:");
								
									    String path = in.nextLine().trim();
								
									    while(path.length()==0){
								
									      System.out.println("Enter path of the index:");
								
									      path = in.nextLine().trim();
								
									    }
								
									 
								
									    System.out.println("Enter max hit number:");
								
									    int max = in.nextInt();
								
									    while(max<0){
								
									      System.out.println("Enter max hit number:");
								
									      max = in.nextInt();
								
									    }
								
									    in.nextLine();
								
									    System.out.print("Search>>> ");
								
									    String text = in.nextLine().trim();
								
									    /*循环读入用户的关键词，如果是q则退出，长度为0也退出*/
								
									    while(!text.equals("q")){
								
									      if(text.length()>0){
								
									        SearchIndex search = new SearchIndex(path);
								
									        int hits = search.search("text", text, max);
								
									        if(hits!=-1){
								
									          search.printHits();
								
									        }
								
									      }
								
									      System.out.print("Search>>> ");
								
									      text = in.nextLine().trim();
								
									    }
								
									  }
								
									}
							
						
					
				
			
		
	
	
		
			
	


	UI界面：UI.java（这里为了方便只是命令行的形式，可以根据需求写一个GUI界面）

	
		
			
				?
			
				
					
						
							
								1
							
								2
							
								3
							
								4
							
								5
							
								6
							
								7
							
								8
							
								9
							
								10
							
								11
							
								12
							
								13
							
								14
							
								15
							
								16
							
								17
							
								18
							
								19
							
								20
							
								21
							
								22
							
								23
							
								24
							
								25
							
								26
							
								27
							
								28
						
						
							
								
									package webCrawler.UI;
								
									 
								
									import java.util.Scanner;
								
									 
								
									import webCrawler.Index.SearchIndex;
								
									 
								
									/**
								
									 * @author lannooo
								
									 *
								
									 */
								
									public class UI {
								
									  public static void main(String[] args) {
								
									    /*输入关键词*/
								
									    Scanner in = new Scanner(System.in);
								
									    System.out.print("Search>>> ");
								
									    String text = in.nextLine().trim();
								
									    /*对于用户的关键词，如果是q则退出，长度为0也退出*/
								
									    while(!text.equals("q") && text.length()>0){
								
									      SearchIndex search = new SearchIndex("d:/index-spider2");
								
									      int hits = search.search("text", text, 20);
								
									      if(hits!=-1){
								
									        search.printHits();
								
									      }
								
									      System.out.print("Search>>> ");
								
									      text = in.nextLine().trim();
								
									    }
								
									  }
								
									}
							
						
					
				
			
		
	
	
		
			
	


	以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持服务器之家。

	原文链接：http://blog.csdn.net/qq_22187919/article/details/60466006
标签：Java Web 搜索引擎 
相关文章
Java京东面试题之为什么HashMap线程不安全2022-03-11
Java面试为何阿里强制要求不在foreach里执行删除操2022-03-11
图解Java排序算法之希尔排序2022-03-11
图解Java排序算法之快速排序的三数取中法2022-03-11
图解Java排序算法之堆排序2022-03-11
java中TreeMap集合的常用方法详解2022-03-11
热门资讯
2022年最旺的微信头像大全 微信头像2022年最新版图片 2022-01-10
蜘蛛侠3英雄无归3正片免费播放 蜘蛛侠3在线观看免费高清完整 2021-08-24
背刺什么意思 网络词语背刺是什么梗 2020-05-22
yue是什么意思 网络流行语yue了是什么梗 2020-10-11
暖暖日本高清免费中文 暖暖在线观看免费完整版韩国 2021-05-08
返回顶部
首页 l 电脑版 l 网站标签 l 网站地图