JAVA--Reptile

簡單小爬蟲html

項目中技術:

  • Jsoup (解析網頁:請求網址返回網頁信息,Jsoup獲取對應節點的數據)
  • Lucene(搜索引擎:對抓取的數據進行搜索,相比較而言,比數據庫查詢要快的多!)
  • 前端樣式用的是Bootstrap
  • 前端使用Ajax請求數據,後臺使用Servlet處理請求,前後臺傳遞數據格式爲Json
  • 數據庫鏈接使用JDBC
  • 存儲的數據庫是:SqlServer

項目的環境:

  • 運行環境:apache-tomcat-7.0.94
  • 開發環境:JDK8.0
  • 開發工具:Eclipse

效果展現:

       

       

       

 

代碼展現:

/**
 * Sends an HTTP GET request to {@code url} and returns the response body.
 *
 * <p>The response stream is decoded as GBK and read line by line; each line is
 * passed through {@code getUTF8StringFromGBKString} and appended without line
 * separators, so the result is the page source collapsed onto a single line.
 *
 * <p>Failures are reported on standard output and are not propagated: whatever
 * was read before the failure (possibly the empty string) is returned.
 *
 * @param url absolute URL of the page to fetch
 * @return the concatenated response lines, or a partial/empty string on failure
 */
public String sendGet(String url) {
    StringBuilder page = new StringBuilder();
    try {
        System.out.println(url);

        // Open the connection and send browser-like request headers.
        URLConnection connection = new URL(url).openConnection();
        connection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        connection.setRequestProperty("Content-type", "text/html;charset=gbk");
        connection.setRequestProperty("upgrade-insecure-requests", "1");
        connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
        connection.setRequestProperty("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");
        connection.connect();

        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "GBK"))) {
            String line;
            while ((line = in.readLine()) != null) {
                // NOTE(review): the reader has already decoded GBK into a Java String;
                // confirm this extra conversion is still needed.
                page.append(getUTF8StringFromGBKString(line));
            }
        }
    } catch (Exception e) {
        System.out.println("發送GET請求出現異常!" + e);
        e.printStackTrace();
    }
    return page.toString();
}

 

  1 //對抓取的數據存入磁盤中,進行全文搜索
  2 public class LuceneManager {
  3     
  4        public static List<shop> Select(String name,int tid,int uid) throws IOException {
  5 
  6             IndexSearcher indexSearcher =new IndexSearcher(LuceneUtils.getDirectory());
  7 
  8             //建立一個布爾查詢對象
  9 
 10             BooleanQuery query = new BooleanQuery();
 11             
 12             //建立第一個查詢條件
 13 if (name!=null&&name.length()!=0) {
 14     int maxEdits = 100; //相同的前綴長度
 15 //  Query query = new FuzzyQuery(term,maxEdits,prefixLength);
 16     char[] tc=name.toCharArray();
 17     for (int i = 0; i < tc.length; i++) {
 18         Term term = new Term("shopName","*"+tc[i]+"*");
 19         Query shopnameQuery=new WildcardQuery(term);
 20         query.add(shopnameQuery, Occur.MUST);
 21     }
 22 //    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
 23 //
 24 //    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
 25 //    highlighter.setTextFragmenter(new SimpleFragmenter(text.length()));
 26 
 27 }
 28 else {
 29     
 30 }
 31             
 32 
 33             Query tidQuery = new TermQuery(new Term("tid",String.valueOf(tid)));
 34             Query uidQuery = new TermQuery(new Term("uid",String.valueOf(uid)));
 35 
 36             //組合查詢條件
 37 
 38         
 39 
 40             query.add(tidQuery, Occur.MUST);
 41             query.add(uidQuery, Occur.MUST);
 42             //執行查詢
 43 
 44              TopDocs topDocs = indexSearcher.search(query, 100);
 45              List<shop> list=new ArrayList<shop>();
 46                 //獲取符合條件的編號
 47              System.out.println(topDocs.scoreDocs.length);
 48                 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
 49                     ScoreDoc scoreDoc = topDocs.scoreDocs[i];
 50                     int no = scoreDoc.doc;
 51                     //用indexSearcher對象去索引庫中查詢編號對應的Document對象
 52                     Document document = indexSearcher.doc(no);
 53                     //將Document對象中的全部屬性取出,再封裝回JavaBean對象中去
 54                    shop user = (shop) LuceneUtils.Document2JavaBean(document, shop.class);
 55                    list.add(user);
 56                   System.out.println(user.getiD()+":"+user.getShopName()+":"+user.getShopPic()+":"+user.getShopPrice()+":"+user.getShopSalesvolume()+":"+user.getStoreName()+":"+user.getTid()+":"+user.getUid());
 57                 }
 58 return list;
 59             }
 60 
 61        public static void createIndexDB(shop userShop) throws Exception {
 62                    //把數據填充到JavaBean對象中
 63            // User user = new User("1", "鍾福成23", "將來的程序員3");
 64            //shop userShop=new shop("4", "小米9", "2000.0", "華爲旗艦店", "1q23", "1","2","1000");
 65                 //建立Document對象【導入的是Lucene包下的Document對象】
 66                 Document document = new Document();
 67                 //將JavaBean對象全部的屬性值,均放到Document對象中去,屬性名能夠和JavaBean相同或不一樣
 68                 /**
 69                  * 向Document對象加入一個字段
 70                  * 參數一:字段的關鍵字
 71                  * 參數二:字符的值
 72                  * 參數三:是否要存儲到原始記錄表中
 73                  *      YES表示是
 74                  *      NO表示否
 75                  * 參數四:是否須要將存儲的數據拆分到詞彙表中
 76                  *      ANALYZED表示拆分
 77                  *      NOT_ANALYZED表示不拆分
 78                  *
 79                  * */
 80 //                document.add(new Field("id", user, Field.Store.YES, Field.Index.ANALYZED));
 81 //                document.add(new Field("userName", user.getUserName(), Field.Store.YES, Field.Index.ANALYZED));
 82 //                document.add(new Field("sal", user.getSal(), Field.Store.YES, Field.Index.ANALYZED));
 83                document.add(new Field("iD", userShop.getiD(), Field.Store.YES, Field.Index.ANALYZED));
 84                 document.add(new Field("shopName", userShop.getShopName(), Field.Store.YES, Field.Index.ANALYZED));
 85                document.add(new Field("shopPic", userShop.getShopPic(), Field.Store.YES, Field.Index.ANALYZED));
 86                document.add(new Field("shopPrice", userShop.getShopPrice(), Field.Store.YES, Field.Index.ANALYZED));
 87                document.add(new Field("shopSalesvolume", userShop.getShopSalesvolume(), Field.Store.YES, Field.Index.ANALYZED));
 88                document.add(new Field("storeName", userShop.getStoreName(), Field.Store.YES, Field.Index.ANALYZED));
 89                document.add(new Field("tid", userShop.getTid(), Field.Store.YES, Field.Index.ANALYZED));
 90                document.add(new Field("uid", userShop.getUid(), Field.Store.YES, Field.Index.ANALYZED));
 91                 //建立IndexWriter對象
 92                 //目錄指定爲E:/createIndexDB
 93                 Directory directory = FSDirectory.open(new File("D:/createIndexDB"));
 94 
 95                 //使用標準的分詞算法對原始記錄表進行拆分
 96                 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
 97 
 98                 //LIMITED默認是1W個
 99                 IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED;
100                 /**
101                  * IndexWriter將咱們的document對象寫到硬盤中
102                  *
103                  * 參數一:Directory d,寫到硬盤中的目錄路徑是什麼
104                  * 參數二:Analyzer a, 以何種算法來對document中的原始記錄表數據進行拆分紅詞彙表
105                  * 參數三:MaxFieldLength mfl 最多將文本拆分出多少個詞彙
106                  *
107                  * */
108                 IndexWriter indexWriter = new IndexWriter(directory, analyzer, maxFieldLength);
109                 Term id=new Term("iD",String.valueOf(userShop.getiD()));
110                // Term id=new Term("id",String.valueOf(user.getId()));
111                 indexWriter.updateDocument(id,document);
112                 //將Document對象經過IndexWriter對象寫入索引庫中
113             
114                 indexWriter.optimize();
115 
116                 //設置合併因子爲3,每當有3個cfs文件,就合併
117                 indexWriter.setMergeFactor(3);
118                 //關閉IndexWriter對象
119                 indexWriter.close();
120             }
121 
122        public static void DeleteByID(int id) throws IOException {
123                  //建立Document對象【導入的是Lucene包下的Document對象】
124                 Document document = new Document();
125 
126                 //建立IndexWriter對象
127                 //目錄指定爲E:/createIndexDB
128                 Directory directory = FSDirectory.open(new File("D:/createIndexDB"));
129 
130                 //使用標準的分詞算法對原始記錄表進行拆分
131                 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
132 
133                 //LIMITED默認是1W個
134                 IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED;
135                 /**
136                  * IndexWriter將咱們的document對象寫到硬盤中
137                  *
138                  * 參數一:Directory d,寫到硬盤中的目錄路徑是什麼
139                  * 參數二:Analyzer a, 以何種算法來對document中的原始記錄表數據進行拆分紅詞彙表
140                  * 參數三:MaxFieldLength mfl 最多將文本拆分出多少個詞彙
141                  *
142                  * */
143                 IndexWriter indexWriter = new IndexWriter(directory, analyzer, maxFieldLength);
144                 indexWriter.deleteDocuments(new Term("iD", String.valueOf(id)));
145                 indexWriter.optimize();
146 
147                 //設置合併因子爲3,每當有3個cfs文件,就合併
148                 indexWriter.setMergeFactor(3);
149                 //關閉IndexWriter對象
150                 indexWriter.close();
151             }
152          
153        public static void findIndexDB(String nameString) throws Exception {
154 
155                 //建立IndexSearcher對象
156                 IndexSearcher indexSearcher = new IndexSearcher(LuceneUtils.getDirectory());
157                 //建立QueryParser對象
158                 QueryParser queryParser = new QueryParser(Version.LUCENE_30, "shopName", LuceneUtils.getAnalyzer());
159                 //給出要查詢的關鍵字
160                 String keyWords = nameString;
161                 //建立Query對象來封裝關鍵字
162                 Query query = queryParser.parse(keyWords);
163                 //用IndexSearcher對象去索引庫中查詢符合條件的前100條記錄,不足100條記錄的以實際爲準
164                 TopDocs topDocs = indexSearcher.search(query, 100);
165                 //獲取符合條件的編號
166              System.out.println(topDocs.scoreDocs.length);
167                 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
168                     ScoreDoc scoreDoc = topDocs.scoreDocs[i];
169                     int no = scoreDoc.doc;
170                     //用indexSearcher對象去索引庫中查詢編號對應的Document對象
171                     Document document = indexSearcher.doc(no);
172                     //將Document對象中的全部屬性取出,再封裝回JavaBean對象中去
173                     shop user = (shop) LuceneUtils.Document2JavaBean(document, shop.class);
174                        System.out.println(user.getiD()+":"+user.getShopName()+":"+user.getShopPic()+":"+user.getShopPrice()+":"+user.getShopSalesvolume()+":"+user.getStore15:49:00Name()+":"+user.getTid()+":"+user.getUid());
177                 }
178             }
179 
180 }

 須要源碼的,能夠私信我!QQ:2748434806前端

相關文章
相關標籤/搜索
本站公眾號
   歡迎關注本站公眾號,獲取更多信息