簡單小爬蟲html
//請求連接,獲取網頁源碼 public String sendGet(String url) { String result = ""; StringBuffer sb = new StringBuffer(); BufferedReader in = null; try { String urlNameString = url; System.out.println(urlNameString); URL realUrl = new URL(urlNameString); // 打開和URL之間的鏈接 URLConnection connection = realUrl.openConnection(); // 設置通用的請求屬性 connection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); connection.setRequestProperty("Content-type", "text/html;charset=gbk"); connection.setRequestProperty("upgrade-insecure-requests", "1"); connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9"); connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"); // connection connection.connect(); // 獲取全部響應頭字段 Map<String, List<String>> map = connection.getHeaderFields(); in = new BufferedReader(new InputStreamReader( connection.getInputStream(),"GBK")); String lines; while ((lines = in.readLine()) != null) { lines=getUTF8StringFromGBKString(lines); sb.append(lines); //System.out.println(line); } } catch (Exception e) { System.out.println("發送GET請求出現異常!" + e); e.printStackTrace(); } // 使用finally塊來關閉輸入流 finally { try { if (in != null) { in.close(); } } catch (Exception e2) { e2.printStackTrace(); } } return sb.toString(); }
1 //對抓取的數據存入磁盤中,進行全文搜索 2 public class LuceneManager { 3 4 public static List<shop> Select(String name,int tid,int uid) throws IOException { 5 6 IndexSearcher indexSearcher =new IndexSearcher(LuceneUtils.getDirectory()); 7 8 //建立一個布爾查詢對象 9 10 BooleanQuery query = new BooleanQuery(); 11 12 //建立第一個查詢條件 13 if (name!=null&&name.length()!=0) { 14 int maxEdits = 100; //相同的前綴長度 15 // Query query = new FuzzyQuery(term,maxEdits,prefixLength); 16 char[] tc=name.toCharArray(); 17 for (int i = 0; i < tc.length; i++) { 18 Term term = new Term("shopName","*"+tc[i]+"*"); 19 Query shopnameQuery=new WildcardQuery(term); 20 query.add(shopnameQuery, Occur.MUST); 21 } 22 // SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>"); 23 // 24 // Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query)); 25 // highlighter.setTextFragmenter(new SimpleFragmenter(text.length())); 26 27 } 28 else { 29 30 } 31 32 33 Query tidQuery = new TermQuery(new Term("tid",String.valueOf(tid))); 34 Query uidQuery = new TermQuery(new Term("uid",String.valueOf(uid))); 35 36 //組合查詢條件 37 38 39 40 query.add(tidQuery, Occur.MUST); 41 query.add(uidQuery, Occur.MUST); 42 //執行查詢 43 44 TopDocs topDocs = indexSearcher.search(query, 100); 45 List<shop> list=new ArrayList<shop>(); 46 //獲取符合條件的編號 47 System.out.println(topDocs.scoreDocs.length); 48 for (int i = 0; i < topDocs.scoreDocs.length; i++) { 49 ScoreDoc scoreDoc = topDocs.scoreDocs[i]; 50 int no = scoreDoc.doc; 51 //用indexSearcher對象去索引庫中查詢編號對應的Document對象 52 Document document = indexSearcher.doc(no); 53 //將Document對象中的全部屬性取出,再封裝回JavaBean對象中去 54 shop user = (shop) LuceneUtils.Document2JavaBean(document, shop.class); 55 list.add(user); 56 System.out.println(user.getiD()+":"+user.getShopName()+":"+user.getShopPic()+":"+user.getShopPrice()+":"+user.getShopSalesvolume()+":"+user.getStoreName()+":"+user.getTid()+":"+user.getUid()); 57 } 58 return list; 59 } 60 61 public static void createIndexDB(shop userShop) throws 
Exception { 62 //把數據填充到JavaBean對象中 63 // User user = new User("1", "鍾福成23", "將來的程序員3"); 64 //shop userShop=new shop("4", "小米9", "2000.0", "華爲旗艦店", "1q23", "1","2","1000"); 65 //建立Document對象【導入的是Lucene包下的Document對象】 66 Document document = new Document(); 67 //將JavaBean對象全部的屬性值,均放到Document對象中去,屬性名能夠和JavaBean相同或不一樣 68 /** 69 * 向Document對象加入一個字段 70 * 參數一:字段的關鍵字 71 * 參數二:字符的值 72 * 參數三:是否要存儲到原始記錄表中 73 * YES表示是 74 * NO表示否 75 * 參數四:是否須要將存儲的數據拆分到詞彙表中 76 * ANALYZED表示拆分 77 * NOT_ANALYZED表示不拆分 78 * 79 * */ 80 // document.add(new Field("id", user, Field.Store.YES, Field.Index.ANALYZED)); 81 // document.add(new Field("userName", user.getUserName(), Field.Store.YES, Field.Index.ANALYZED)); 82 // document.add(new Field("sal", user.getSal(), Field.Store.YES, Field.Index.ANALYZED)); 83 document.add(new Field("iD", userShop.getiD(), Field.Store.YES, Field.Index.ANALYZED)); 84 document.add(new Field("shopName", userShop.getShopName(), Field.Store.YES, Field.Index.ANALYZED)); 85 document.add(new Field("shopPic", userShop.getShopPic(), Field.Store.YES, Field.Index.ANALYZED)); 86 document.add(new Field("shopPrice", userShop.getShopPrice(), Field.Store.YES, Field.Index.ANALYZED)); 87 document.add(new Field("shopSalesvolume", userShop.getShopSalesvolume(), Field.Store.YES, Field.Index.ANALYZED)); 88 document.add(new Field("storeName", userShop.getStoreName(), Field.Store.YES, Field.Index.ANALYZED)); 89 document.add(new Field("tid", userShop.getTid(), Field.Store.YES, Field.Index.ANALYZED)); 90 document.add(new Field("uid", userShop.getUid(), Field.Store.YES, Field.Index.ANALYZED)); 91 //建立IndexWriter對象 92 //目錄指定爲E:/createIndexDB 93 Directory directory = FSDirectory.open(new File("D:/createIndexDB")); 94 95 //使用標準的分詞算法對原始記錄表進行拆分 96 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); 97 98 //LIMITED默認是1W個 99 IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED; 100 /** 101 * IndexWriter將咱們的document對象寫到硬盤中 102 * 103 * 參數一:Directory d,寫到硬盤中的目錄路徑是什麼 104 * 
參數二:Analyzer a, 以何種算法來對document中的原始記錄表數據進行拆分紅詞彙表 105 * 參數三:MaxFieldLength mfl 最多將文本拆分出多少個詞彙 106 * 107 * */ 108 IndexWriter indexWriter = new IndexWriter(directory, analyzer, maxFieldLength); 109 Term id=new Term("iD",String.valueOf(userShop.getiD())); 110 // Term id=new Term("id",String.valueOf(user.getId())); 111 indexWriter.updateDocument(id,document); 112 //將Document對象經過IndexWriter對象寫入索引庫中 113 114 indexWriter.optimize(); 115 116 //設置合併因子爲3,每當有3個cfs文件,就合併 117 indexWriter.setMergeFactor(3); 118 //關閉IndexWriter對象 119 indexWriter.close(); 120 } 121 122 public static void DeleteByID(int id) throws IOException { 123 //建立Document對象【導入的是Lucene包下的Document對象】 124 Document document = new Document(); 125 126 //建立IndexWriter對象 127 //目錄指定爲E:/createIndexDB 128 Directory directory = FSDirectory.open(new File("D:/createIndexDB")); 129 130 //使用標準的分詞算法對原始記錄表進行拆分 131 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); 132 133 //LIMITED默認是1W個 134 IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED; 135 /** 136 * IndexWriter將咱們的document對象寫到硬盤中 137 * 138 * 參數一:Directory d,寫到硬盤中的目錄路徑是什麼 139 * 參數二:Analyzer a, 以何種算法來對document中的原始記錄表數據進行拆分紅詞彙表 140 * 參數三:MaxFieldLength mfl 最多將文本拆分出多少個詞彙 141 * 142 * */ 143 IndexWriter indexWriter = new IndexWriter(directory, analyzer, maxFieldLength); 144 indexWriter.deleteDocuments(new Term("iD", String.valueOf(id))); 145 indexWriter.optimize(); 146 147 //設置合併因子爲3,每當有3個cfs文件,就合併 148 indexWriter.setMergeFactor(3); 149 //關閉IndexWriter對象 150 indexWriter.close(); 151 } 152 153 public static void findIndexDB(String nameString) throws Exception { 154 155 //建立IndexSearcher對象 156 IndexSearcher indexSearcher = new IndexSearcher(LuceneUtils.getDirectory()); 157 //建立QueryParser對象 158 QueryParser queryParser = new QueryParser(Version.LUCENE_30, "shopName", LuceneUtils.getAnalyzer()); 159 //給出要查詢的關鍵字 160 String keyWords = nameString; 161 //建立Query對象來封裝關鍵字 162 Query query = queryParser.parse(keyWords); 163 
//用IndexSearcher對象去索引庫中查詢符合條件的前100條記錄,不足100條記錄的以實際爲準 164 TopDocs topDocs = indexSearcher.search(query, 100); 165 //獲取符合條件的編號 166 System.out.println(topDocs.scoreDocs.length); 167 for (int i = 0; i < topDocs.scoreDocs.length; i++) { 168 ScoreDoc scoreDoc = topDocs.scoreDocs[i]; 169 int no = scoreDoc.doc; 170 //用indexSearcher對象去索引庫中查詢編號對應的Document對象 171 Document document = indexSearcher.doc(no); 172 //將Document對象中的全部屬性取出,再封裝回JavaBean對象中去 173 shop user = (shop) LuceneUtils.Document2JavaBean(document, shop.class); 174 System.out.println(user.getiD()+":"+user.getShopName()+":"+user.getShopPic()+":"+user.getShopPrice()+":"+user.getShopSalesvolume()+":"+user.getStore15:49:00Name()+":"+user.getTid()+":"+user.getUid()); 177 } 178 } 179 180 }
需要源碼的,可以私信我!QQ:2748434806 前端