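/**
 * Website crawler.
 *
 * Planned improvement: record the paths of the files that have already been fetched, so that
 * running the program again does not fetch them a second time.
 * Until then this program must not be run more than once in the same directory: downloaded
 * files are written in append mode, so every additional run appends the content again.
 */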
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
/*
*/
/**
 * Minimal breadth-first site crawler.
 *
 * <p>Starting from one page, it downloads that page and every script, stylesheet and HTML page
 * it links to on the same host, and mirrors them below the current working directory as
 * {@code <authority>/<path>}.
 *
 * <p>Links are discovered with line-based string heuristics (one link per line at most), not
 * with an HTML parser. Content is read and written as text, line by line, and files are opened
 * in append mode, so running the crawler twice in the same directory duplicates file content.
 */
public class load {
    /** Page the crawl starts from. */
    private static final String START_URL = "http://www.zi-han.net/theme/hplus/index.html";

    /** Value for the {@code Cookie} request header; the header is omitted when this is empty. */
    private static final String COOKIE = "";

    /** Upper bound for establishing a connection, so an unreachable host cannot stall the crawl. */
    private static final int CONNECT_TIMEOUT_MILLIS = 15000;

    /** Upper bound for a single blocking read from the server. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** URLs waiting to be downloaded, in discovery order. */
    private static final Queue<String> queue = new LinkedList<String>();

    /** Every URL that has ever been queued; keeps pages that link to each other from looping. */
    private static final Set<String> visited = new HashSet<String>();

    /** URL the content is fetched from. */
    private final String u;

    /** Charset name used to decode the response body. */
    private final String encoding;

    /**
     * Crawls the site reachable from the start page until no unvisited link is left.
     *
     * @param args ignored
     * @throws Exception if a queued URL cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        visited.add(START_URL);
        queue.offer(START_URL);
        String next;
        while ((next = queue.poll()) != null) {
            load client = new load(next, "UTF-8");
            client.run(next);
        }
    }

    /**
     * @param u        URL to download
     * @param encoding charset name used to decode the response body
     */
    public load(String u, String encoding) {
        this.u = u;
        this.encoding = encoding;
    }

    /**
     * Downloads the URL given to the constructor, appends its lines to the local mirror file
     * and queues every not-yet-seen same-host link found in it.
     *
     * <p>Failures while downloading or writing are reported on standard error and end the
     * processing of this page only; the crawl as a whole continues.
     *
     * @param cur_url URL that determines the local file location and against which relative
     *                links are resolved (the crawler passes the same value as the fetch URL)
     * @throws Exception if {@code cur_url} or the fetch URL is not a well-formed URL
     */
    public void run(String cur_url) throws Exception {
        URL pageUrl = new URL(cur_url);
        URL url = new URL(u);
        try {
            File fp = localFileFor(pageUrl);
            File folder = fp.getParentFile();
            if (folder != null && !folder.isDirectory() && !folder.mkdirs()) {
                throw new IOException("cannot create directory " + folder);
            }

            HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
            urlConnection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            urlConnection.setReadTimeout(READ_TIMEOUT_MILLIS);
            urlConnection.setRequestProperty("accept", "*/*");
            urlConnection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
            urlConnection.setRequestProperty("Cache-Control", "max-age=0");
            urlConnection.setRequestProperty("connection", "Keep-Alive");
            if (COOKIE.length() > 0) {
                urlConnection.setRequestProperty("Cookie", COOKIE);
            }
            urlConnection.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Plain GET without a request body: output is deliberately not enabled, and input
            // is enabled by default.

            // Closing the reader releases the response stream even when a write fails midway.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConnection.getInputStream(), encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // The link is looked up afresh for every line, so a link found on one line
                    // is never queued again for the lines that follow it.
                    String link = extractLink(line);
                    if (link != null) {
                        String target = resolve(pageUrl, link);
                        if (target != null && visited.add(target)) {
                            queue.offer(target);
                        }
                    }
                    addfile(line, fp);
                }
            }
        } catch (Exception e) {
            // Best effort per page: report what failed and let the caller move on.
            System.err.println("error once: " + u + " (" + e + ")");
        }
    }

    /**
     * Appends one line followed by CRLF to the given file, encoded as UTF-8.
     *
     * <p>If the file cannot be opened, the problem is printed and the line is dropped.
     *
     * @param line text to append
     * @param fp   file to append to; created if it does not exist
     * @throws IOException if writing to or closing the file fails
     */
    public void addfile(String line, File fp) throws IOException {
        try (OutputStreamWriter writer =
                new OutputStreamWriter(new FileOutputStream(fp, true), "utf-8")) {
            writer.append(line);
            writer.append("\r\n");
        } catch (FileNotFoundException e) {
            System.out.println("error!");
            e.printStackTrace();
        }
    }

    /**
     * Maps a URL to its mirror file below the working directory.
     *
     * @throws IOException if the URL has no authority or maps to a location outside the
     *                     working directory (for example through {@code ..} path segments)
     */
    private static File localFileFor(URL url) throws IOException {
        String authority = url.getAuthority();
        if (authority == null || authority.length() == 0) {
            throw new IOException("URL has no host: " + url);
        }
        String path = url.getPath();
        if (path.length() == 0) {
            path = "/";
        }
        if (path.endsWith("/")) {
            // A directory-style URL needs a file name; otherwise it would be saved as a plain
            // file that later blocks creating the directory of the same name.
            path = path + "index.html";
        }
        File target = new File(authority + path);

        // The path comes from remote content, so make sure it cannot escape the mirror root.
        String root = new File(".").getCanonicalPath();
        String prefix = root.endsWith(File.separator) ? root : root + File.separator;
        if (!target.getCanonicalPath().startsWith(prefix)) {
            throw new IOException("refusing to write outside the working directory: " + target);
        }
        return target;
    }

    /**
     * Finds the link carried by one line of markup, or returns {@code null} if there is none.
     * When several rules match, a form action wins over a page link, which wins over a
     * stylesheet, which wins over a script.
     */
    private static String extractLink(String line) {
        String link = null;
        if (line.contains("html")) {
            if (line.contains("action")) {
                link = attributeValue(line, "action=", "html");
            }
            if (link == null && line.contains("href")) {
                // Menu entries carry the real target in data-id; plain anchors use href.
                String attribute = line.contains("data-id") ? "data-id=" : "href=";
                link = attributeValue(line, attribute, "html");
            }
        }
        if (link == null && (line.contains(".css\"") || line.contains(".css?"))) {
            link = attributeValue(line, "href=", ".css");
        }
        if (link == null && (line.contains(".js\"") || line.contains(".js?"))) {
            link = attributeValue(line, "src=", ".js");
        }
        return link;
    }

    /**
     * Returns the text from just after the opening quote of {@code attribute} up to and
     * including the first {@code extension} that follows it, or {@code null} if the line does
     * not have that shape. The attribute value is assumed to be quoted.
     */
    private static String attributeValue(String line, String attribute, String extension) {
        int attributeAt = line.indexOf(attribute);
        if (attributeAt < 0) {
            return null;
        }
        int start = attributeAt + attribute.length() + 1; // +1 skips the opening quote
        int extensionAt = line.indexOf(extension, start);
        if (extensionAt < 0) {
            return null;
        }
        String value = line.substring(start, extensionAt + extension.length());
        // A quote or blank inside the value means the extension belongs to a later attribute.
        if (value.indexOf('"') >= 0 || value.indexOf('\'') >= 0 || value.indexOf(' ') >= 0) {
            return null;
        }
        return value;
    }

    /**
     * Resolves a link against the page it was found on.
     *
     * @return the absolute URL, or {@code null} if the link is malformed, is not HTTP(S) or
     *         points to a different host
     */
    private static String resolve(URL base, String link) {
        try {
            URL resolved = new URL(base, link);
            String protocol = resolved.getProtocol();
            if (!"http".equals(protocol) && !"https".equals(protocol)) {
                return null;
            }
            if (!base.getHost().equalsIgnoreCase(resolved.getHost())) {
                return null;
            }
            return resolved.toExternalForm();
        } catch (MalformedURLException e) {
            return null;
        }
    }
}