爬虫入门 手写一个Java爬虫
本文内容来源于罗刚老师的书籍《自己动手写网络爬虫》;
本文将介绍 1: 网络爬虫是做什么的? 2: 手动写一个简单的网络爬虫;
1: 网络爬虫是做什么的? 它的主要工作就是根据指定的url地址去发送请求,获得响应,然后解析响应,一方面从响应中查找出想要查找的数据,另一方面从响应中解析出新的URL路径,
然后继续访问,继续解析;继续查找需要的数据和继续解析出新的URL路径。
这就是网络爬虫主要干的工作。下面是流程图:
通过上面的流程图能大概了解到网络爬虫干了哪些活,根据这些也就能设计出一个简单的网络爬虫出来。
一个简单的爬虫必需的功能:
1: 发送请求和获取响应的功能;
2: 解析响应的功能;
3: 对过滤出的数据进行存储的功能;
4: 对解析出来的URL路径进行处理的功能;
下面是包结构:
下面就上代码:
RequestAndResponseTool 类: 主要方法: 发送请求,返回响应,并把响应封装成 Page 类;
package com.etoak.crawl.page;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import java.io.IOException;
public class RequestAndResponseTool {
public static Page sendRequstAndGetResponse(String url) {
Page page = null;
// 1.çæ HttpClinet å¯¹è±¡å¹¶è®¾ç½®åæ°
HttpClient httpClient = new HttpClient();
// 设置 HTTP è¿æ¥è¶
æ¶ 5s
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
// 2.çæ GetMethod å¯¹è±¡å¹¶è®¾ç½®åæ°
GetMethod getMethod = new GetMethod(url);
// 设置 get 请æ±è¶
æ¶ 5s
getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
// 设置请æ±éè¯å¤ç
getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
// 3.æ§è¡ HTTP GET 请æ±
try {
int statusCode = httpClient.executeMethod(getMethod);
// å¤æè®¿é®çç¶æç
if (statusCode != HttpStatus.SC_OK) {
System.err.println("Method failed: " + getMethod.getStatusLine());
}
// 4.å¤ç HTTP ååºå
容
byte[] responseBody = getMethod.getResponseBody();// 读å为åè æ°ç»
String contentType = getMethod.getResponseHeader("Content-Type").getValue(); // å¾å°å½åè¿åç±»å
page = new Page(responseBody,url,contentType); //å°è£
æä¸ºé¡µé¢
} catch (HttpException e) {
// åçè´å½çå¼å¸¸ï¼å¯è½æ¯åè®®ä¸å¯¹æè
è¿åçå
容æé®é¢
System.out.println("Please check your provided http address!");
e.printStackTrace();
} catch (IOException e) {
// åçç½ç»å¼å¸¸
e.printStackTrace();
} finally {
// éæ¾è¿æ¥
getMethod.releaseConnection();
}
return page;
}
}
Page 类: 主要作用: 保存响应的相关内容,对外提供访问方法;
package com.etoak.crawl.page;
import com.etoak.crawl.util.CharsetDetector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.UnsupportedEncodingException;
/*
* page
* 1: ä¿åè·åå°çååºçç¸å
³å
容;
* */
public class Page {
private byte[] content ;
private String html ; //ç½é¡µæºç å符串
private Document doc ;//ç½é¡µDomææ¡£
private String charset ;//å符ç¼ç
private String url ;//urlè·¯å¾
private String contentType ;// å
容类å
public Page(byte[] content , String url , String contentType){
this.content = content ;
this.url = url ;
this.contentType = contentType ;
}
public String getCharset() {
return charset;
}
public String getUrl(){return url ;}
public String getContentType(){ return contentType ;}
public byte[] getContent(){ return content ;}
/**
* è¿åç½é¡µçæºç å符串
*
* @return ç½é¡µçæºç å符串
*/
public String getHtml() {
if (html != null) {
return html;
}
if (content == null) {
return null;
}
if(charset==null){
charset = CharsetDetector.guessEncoding(content); // æ ¹æ®å
容æ¥çæµ å符ç¼ç
}
try {
this.html = new String(content, charset);
return html;
} catch (UnsupportedEncodingException ex) {
ex.printStackTrace();
return null;
}
}
/*
* å¾å°ææ¡£
* */
public Document getDoc(){
if (doc != null) {
return doc;
}
try {
this.doc = Jsoup.parse(getHtml(), url);
return doc;
} catch (Exception ex) {
ex.printStackTrace();
return null;
}
}
}
PageParserTool 类: 主要作用: 提供了根据选择器来选取元素、属性的方法;
package com.etoak.crawl.page;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
/**
 * Static helpers that pick elements, links and attribute values out of a
 * {@link Page} with CSS selectors.
 */
public class PageParserTool {

    /**
     * Selects every element of the page that matches the CSS selector.
     *
     * @param page        the page to search; may be {@code null}
     * @param cssSelector a jsoup CSS selector
     * @return the matching elements; empty when the page is {@code null} or
     *         has no parsed document
     */
    public static Elements select(Page page, String cssSelector) {
        if (page == null || page.getDoc() == null) {
            // A failed fetch or parse yields no elements instead of an NPE.
            return new Elements();
        }
        return page.getDoc().select(cssSelector);
    }

    /**
     * Selects one element by position among the matches of a CSS selector.
     *
     * @param page        the page to search
     * @param cssSelector a jsoup CSS selector
     * @param index       zero-based position; a negative value counts from the
     *                    end ({@code -1} is the last match)
     * @return the element at that position
     * @throws IndexOutOfBoundsException if no match exists at that position
     */
    public static Element select(Page page, String cssSelector, int index) {
        Elements eles = select(page, cssSelector);
        int realIndex = index;
        if (index < 0) {
            realIndex = eles.size() + index;
        }
        return eles.get(realIndex);
    }

    /**
     * Collects the absolute link of every element matching the selector.
     * The {@code href} attribute is used when present, otherwise {@code src}.
     * The selector must target the link-bearing elements themselves: to get
     * all hyperlinks inside the div with id {@code content}, pass
     * {@code div[id=content] a}. A set is used to drop duplicates.
     *
     * @param page        the page to search
     * @param cssSelector a jsoup CSS selector
     * @return the distinct absolute URLs; links that cannot be made absolute
     *         are skipped
     */
    public static Set<String> getLinks(Page page, String cssSelector) {
        Set<String> links = new HashSet<String>();
        for (Element element : select(page, cssSelector)) {
            String link = "";
            if (element.hasAttr("href")) {
                link = element.attr("abs:href");
            } else if (element.hasAttr("src")) {
                link = element.attr("abs:src");
            }
            // jsoup yields "" for an "abs:" attribute it cannot resolve.
            if (link.length() > 0) {
                links.add(link);
            }
        }
        return links;
    }

    /**
     * Collects the value of one attribute from every matching element that
     * has it. For example {@code getAttrs(page, "img[src]", "abs:src")}
     * returns the links of all images in the page.
     *
     * @param page        the page to search
     * @param cssSelector a jsoup CSS selector
     * @param attrName    the attribute to read
     * @return the attribute values in document order
     */
    public static ArrayList<String> getAttrs(Page page, String cssSelector, String attrName) {
        ArrayList<String> result = new ArrayList<String>();
        Elements eles = select(page, cssSelector);
        for (Element ele : eles) {
            if (ele.hasAttr(attrName)) {
                result.add(ele.attr(attrName));
            }
        }
        return result;
    }
}
Link 包:
Links 类: 两个属性: 一个是存放已经访问的url集合的 set;一个是存放待访问url集合的 queue;
package com.etoak.crawl.link;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
/**
 * Process-wide crawl bookkeeping: the set of URLs already visited and the
 * FIFO queue of URLs still waiting to be visited.
 */
public class Links {

    // Visited URLs; a set guarantees that no URL is recorded twice.
    private static final Set<String> visitedUrlSet = new HashSet<String>();

    // Pending URLs; a queue fixes the visiting order.
    private static final LinkedList<String> unVisitedUrlQueue = new LinkedList<String>();

    /** Returns the number of URLs recorded as visited. */
    public static int getVisitedUrlNum() {
        return visitedUrlSet.size();
    }

    /** Records a URL as visited. */
    public static void addVisitedUrlSet(String url) {
        visitedUrlSet.add(url);
    }

    /** Removes a URL from the visited set. */
    public static void removeVisitedUrlSet(String url) {
        visitedUrlSet.remove(url);
    }

    /** Returns the live queue of pending URLs (not a copy). */
    public static LinkedList<String> getUnVisitedUrlQueue() {
        return unVisitedUrlQueue;
    }

    /**
     * Appends a URL to the pending queue so that each URL is visited at most
     * once. Null or blank URLs, URLs already visited and URLs already queued
     * are ignored.
     */
    public static void addUnvisitedUrlQueue(String url) {
        if (url != null && !url.trim().isEmpty()
                && !visitedUrlSet.contains(url) && !unVisitedUrlQueue.contains(url)) {
            unVisitedUrlQueue.add(url);
        }
    }

    /**
     * Removes and returns the head of the pending queue.
     *
     * @return the next URL to visit, or {@code null} when the queue is empty
     */
    public static Object removeHeadOfUnVisitedUrlQueue() {
        // poll() returns null on an empty queue; removeFirst() would throw.
        return unVisitedUrlQueue.poll();
    }

    /** Returns {@code true} when no URL is waiting to be visited. */
    public static boolean unVisitedUrlQueueIsEmpty() {
        return unVisitedUrlQueue.isEmpty();
    }
}
LinkFilter 接口: 可以起过滤作用;
package com.etoak.crawl.link;public interface LinkFilter {
public boolean accept(String url);
}
util 工具类
CharsetDetector 类: 获取字符编码
/*
* Copyright (C) 2014 hu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package com.etoak.crawl.util;
import org.mozilla.universalchardet.UniversalDetector;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Automatic character-set detection for fetched page bytes.
 *
 * @author hu
 */
public class CharsetDetector {

    // Only this many leading bytes are scanned for a declared charset
    // (the sniffing code is borrowed from Nutch).
    private static final int CHUNK_SIZE = 2000;

    /** Charset returned when statistical detection finds nothing. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    private static final Pattern metaPattern = Pattern.compile(
            "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
            Pattern.CASE_INSENSITIVE);
    private static final Pattern charsetPattern = Pattern.compile(
            "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
    private static final Pattern charsetPatternHTML5 = Pattern.compile(
            "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Sniffs the charset declared inside the page, using code borrowed from
     * Nutch. It looks first for an http-equiv content-type meta tag, then for
     * an HTML5 {@code <meta charset>} tag, then for a UTF-8 or UTF-16 byte
     * order mark.
     *
     * @param content the raw page bytes
     * @return the declared charset name, or {@code null} if none was found
     */
    private static String guessEncodingByNutch(byte[] content) {
        int length = Math.min(content.length, CHUNK_SIZE);
        String str;
        try {
            // Decode only the leading chunk; the original decoded the whole
            // body and left CHUNK_SIZE unused for the scan.
            str = new String(content, 0, length, "ascii");
        } catch (UnsupportedEncodingException e) {
            return null;
        }
        String encoding = null;
        Matcher metaMatcher = metaPattern.matcher(str);
        if (metaMatcher.find()) {
            Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
            if (charsetMatcher.find()) {
                encoding = charsetMatcher.group(1);
            }
        }
        if (encoding == null) {
            metaMatcher = charsetPatternHTML5.matcher(str);
            if (metaMatcher.find()) {
                encoding = metaMatcher.group(1);
            }
        }
        if (encoding == null) {
            // No declaration found: fall back to a byte order mark.
            if (length >= 3 && content[0] == (byte) 0xEF
                    && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
                encoding = "UTF-8";
            } else if (length >= 2) {
                if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
                    encoding = "UTF-16LE";
                } else if (content[0] == (byte) 0xFE
                        && content[1] == (byte) 0xFF) {
                    encoding = "UTF-16BE";
                }
            }
        }
        return encoding;
    }

    /**
     * Guesses the charset of a byte array with juniversalchardet's
     * {@code UniversalDetector}.
     *
     * @param bytes the bytes to inspect
     * @return the detected charset, or UTF-8 if detection fails
     */
    public static String guessEncodingByMozilla(byte[] bytes) {
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(bytes, 0, bytes.length);
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        detector.reset();
        if (encoding == null) {
            encoding = DEFAULT_ENCODING;
        }
        return encoding;
    }

    /**
     * Guesses the charset of a byte array. The charset declared in the page
     * is preferred; if none is declared, sniffing fails, or this JVM does not
     * support the declared name, statistical detection is used instead.
     *
     * @param content the bytes to inspect
     * @return the guessed charset, or UTF-8 if every detection step fails
     */
    public static String guessEncoding(byte[] content) {
        String encoding = null;
        try {
            encoding = guessEncodingByNutch(content);
        } catch (Exception ignored) {
            // Sniffing is best-effort; the statistical detector takes over.
        }
        if (!isSupported(encoding)) {
            encoding = guessEncodingByMozilla(content);
        }
        return encoding;
    }

    /** Returns whether the name is non-null and a charset this JVM can decode. */
    private static boolean isSupported(String encoding) {
        if (encoding == null) {
            return false;
        }
        try {
            return java.nio.charset.Charset.isSupported(encoding);
        } catch (IllegalArgumentException e) {
            // Thrown for a syntactically illegal charset name.
            return false;
        }
    }
}
FileTool 文件下载类:
package com.etoak.crawl.util;
import com.etoak.crawl.page.Page;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
/**
 * Saves the content of pages that have been visited to local files.
 */
public class FileTool {

    /** Characters replaced by '_' when a URL is turned into a file name. */
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\?/:*|<>\"]";

    // Target directory, with trailing separator; resolved on first save.
    private static String dirPath;

    /**
     * Builds the file name for a page from its URL and content type.
     * The URL scheme is dropped, characters not allowed in file names become
     * '_', and the extension is taken from the content type.
     *
     * @param url         the page URL
     * @param contentType the Content-Type header value; may be {@code null}
     *                    and may carry parameters such as {@code ; charset=...}
     * @return the file name, without directory
     */
    private static String getFileNameByUrl(String url, String contentType) {
        // Strip any scheme ("http://", "https://", ...), not just 7 characters.
        String name = url.replaceFirst("^[a-zA-Z][a-zA-Z0-9+.\\-]*://", "")
                .replaceAll(ILLEGAL_FILE_NAME_CHARS, "_");
        String mime = (contentType == null) ? "" : contentType;
        // text/html and similar types.
        if (mime.indexOf("html") != -1) {
            return name + ".html";
        }
        // Other types such as application/pdf: use the subtype, without parameters.
        int semicolon = mime.indexOf(';');
        if (semicolon >= 0) {
            mime = mime.substring(0, semicolon);
        }
        String extension = mime.substring(mime.lastIndexOf('/') + 1).trim()
                .replaceAll(ILLEGAL_FILE_NAME_CHARS, "_");
        return extension.isEmpty() ? name : name + "." + extension;
    }

    /**
     * Resolves the target directory on first use and creates it if missing.
     * The directory is {@code temp} under the classpath root of this class,
     * or under the working directory when no classpath root is available.
     */
    private static void mkdir() {
        if (dirPath == null) {
            // Resolve against this class; the resource lookup may return null.
            java.net.URL root = FileTool.class.getResource("/");
            String base = (root != null) ? root.getPath() : System.getProperty("user.dir");
            dirPath = new File(base, "temp").getPath() + File.separator;
        }
        File fileDir = new File(dirPath);
        if (!fileDir.exists()) {
            fileDir.mkdirs();
        }
    }

    /**
     * Writes the raw bytes of a page to a file in the target directory.
     * I/O failures are printed and otherwise ignored.
     *
     * @param page the page to save; a null page or null content is skipped
     */
    public static void saveToLocal(Page page) {
        if (page == null || page.getContent() == null) {
            System.err.println("Nothing to save: page or page content is null");
            return;
        }
        mkdir();
        String fileName = getFileNameByUrl(page.getUrl(), page.getContentType());
        String filePath = dirPath + fileName;
        byte[] data = page.getContent();
        // try-with-resources closes the stream even when the write fails.
        try (FileOutputStream out = new FileOutputStream(new File(filePath))) {
            out.write(data);
            out.flush();
            System.out.println("文件:" + fileName + "已经被存储在" + filePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
RegexRule 正则表达式类:
/*
* Copyright (C) 2014 hu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package com.etoak.crawl.util;
import java.util.ArrayList;
import java.util.regex.Pattern;
/**
 * A set of positive and negative regular expressions used to accept or
 * reject strings such as URLs.
 *
 * @author hu
 */
public class RegexRule {

    private final ArrayList<String> positive = new ArrayList<String>();
    private final ArrayList<String> negative = new ArrayList<String>();

    /** Creates a rule set with no rules. */
    public RegexRule() {
    }

    /** Creates a rule set holding the single given rule; see {@link #addRule}. */
    public RegexRule(String rule) {
        addRule(rule);
    }

    /** Creates a rule set from a list of rules; see {@link #addRule}. */
    public RegexRule(ArrayList<String> rules) {
        for (String rule : rules) {
            addRule(rule);
        }
    }

    /**
     * Returns {@code true} when there is no positive rule. Negative rules are
     * not counted; such a rule set satisfies nothing.
     */
    public boolean isEmpty() {
        return positive.isEmpty();
    }

    /**
     * Adds one rule. A leading {@code '+'} marks a positive regex and a
     * leading {@code '-'} a negative regex; in both cases the regex is the
     * text after that character. A rule with neither prefix is a positive
     * regex in full. For example {@code +a.*c} and {@code a.*c} are both the
     * positive regex {@code a.*c}, and {@code -a.*c} is the negative one.
     *
     * @param rule the rule text; a null or empty rule is ignored
     * @return this rule set
     */
    public RegexRule addRule(String rule) {
        if (rule == null || rule.length() == 0) {
            return this;
        }
        char prefix = rule.charAt(0);
        if (prefix == '+') {
            addPositive(rule.substring(1));
        } else if (prefix == '-') {
            addNegative(rule.substring(1));
        } else {
            addPositive(rule);
        }
        return this;
    }

    /**
     * Adds a positive regex.
     *
     * @param positiveregex the regex a string may match to be accepted
     * @return this rule set
     */
    public RegexRule addPositive(String positiveregex) {
        positive.add(positiveregex);
        return this;
    }

    /**
     * Adds a negative regex.
     *
     * @param negativeregex a regex that rejects every string it matches
     * @return this rule set
     */
    public RegexRule addNegative(String negativeregex) {
        negative.add(negativeregex);
        return this;
    }

    /**
     * Tests a string against the rules. The string is accepted when it fully
     * matches at least one positive regex and no negative regex.
     *
     * @param str the string to test
     * @return whether the string satisfies the rules
     */
    public boolean satisfy(String str) {
        for (String nregex : negative) {
            if (Pattern.matches(nregex, str)) {
                return false;
            }
        }
        // One positive match is enough, so stop at the first.
        for (String pregex : positive) {
            if (Pattern.matches(pregex, str)) {
                return true;
            }
        }
        return false;
    }
}
主类:
MyCrawler:
package com.etoak.crawl.main;
import com.etoak.crawl.link.LinkFilter;
import com.etoak.crawl.link.Links;
import com.etoak.crawl.page.Page;
import com.etoak.crawl.page.PageParserTool;
import com.etoak.crawl.page.RequestAndResponseTool;
import com.etoak.crawl.util.FileTool;
import org.jsoup.select.Elements;
import java.util.Set;
public class MyCrawler {
/**
* 使ç¨ç§ååå§å URL éå
*
* @param seeds ç§å URL
* @return
*/
private void initCrawlerWithSeeds(String[] seeds) {
for (int i = 0; i < seeds.length; i++){
Links.addUnvisitedUrlQueue(seeds[i]);
}
}
/**
* æåè¿ç¨
*
* @param seeds
* @return
*/
public void crawling(String[] seeds) {
//åå§å URL éå
initCrawlerWithSeeds(seeds);
//å®ä¹è¿æ»¤å¨ï¼æå以 http://www.baidu.com å¼å¤´ç龿¥
LinkFilter filter = new LinkFilter() {
public boolean accept(String url) {
if (url.startsWith("http://www.baidu.com"))
return true;
else
return false;
}
};
//å¾ªç¯æ¡ä»¶ï¼å¾
æåç龿¥ä¸ç©ºä¸æåçç½é¡µä¸å¤äº 1000
while (!Links.unVisitedUrlQueueIsEmpty() && Links.getVisitedUrlNum() <= 1000) {
//å
ä»å¾
访é®çåºåä¸ååºç¬¬ä¸ä¸ªï¼
String visitUrl = (String) Links.removeHeadOfUnVisitedUrlQueue();
if (visitUrl == null){
continue;
}
//æ ¹æ®URLå¾å°page;
Page page = RequestAndResponseTool.sendRequstAndGetResponse(visitUrl);
//对pageè¿è¡å¤çï¼ è®¿é®DOMçæä¸ªæ ç¾
Elements es = PageParserTool.select(page,"a");
if(!es.isEmpty()){
System.out.println("ä¸é¢å°æå°ææaæ ç¾ï¼ ");
System.out.println(es);
}
//å°ä¿åæä»¶
FileTool.saveToLocal(page);
//å°å·²ç»è®¿é®è¿ç龿¥æ¾å
¥å·²è®¿é®ç龿¥ä¸ï¼
Links.addVisitedUrlSet(visitUrl);
//å¾å°è¶
龿¥
Set<String> links = PageParserTool.getLinks(page,"img");
for (String link : links) {
Links.addUnvisitedUrlQueue(link);
System.out.println("æ°å¢ç¬åè·¯å¾: " + link);
}
}
}
//main æ¹æ³å
¥å£
public static void main(String[] args) {
MyCrawler crawler = new MyCrawler();
crawler.crawling(new String[]{"http://www.baidu.com"});
}
}
运行结果: