public class Access implements Runnable{
HttpURLConnection huc;
InputStream is;
BufferedReader reader;
String url;
public Access(){
try {
url="";
} catch (Exception e) {
e.printStackTrace();
}
try {
huc=(HttpURLConnection)new URL(url).openConnection();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
new Thread(this).start();
}
public void run() {
try {
huc.setRequestMethod("GET");
} catch (ProtocolException e) {
e.printStackTrace();
}
try {
huc.setUseCaches(true);
huc.connect();
} catch (IOException e) {
e.printStackTrace();
}
try {
is=huc.getInputStream();
reader=new BufferedReader(new InputStreamReader(is,huc.getContentType().equals("text-html; charset=gb2312")?"gb2312":"UTF-8"));
StringBuffer temp=new StringBuffer();
String str;
while((str=reader.readLine())!=null){
temp.append(str+"\n");
}
System.out.println(new String(temp));
} catch (IOException e) {
e.printStackTrace();
}finally{
try {
reader.close();
is.close();
huc.disconnect();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
该爬虫设计的关键:
1.control,交互界面,对爬虫的控制
2.analysis HTML,对HTML进行分析,从中提取新的hot link.
3.多线程,并发抓取页面
阅读(4429) | 评论(0) | 转发(0) |