| « | 五月 2012 | » | ||||
|---|---|---|---|---|---|---|
| 一 | 二 | 三 | 四 | 五 | 六 | 日 |
| 1 | 2 | 3 | 4 | 5 | 6 | |
| 7 | 8 | 9 | 10 | 11 | 12 | 13 |
| 14 | 15 | 16 | 17 | 18 | 19 | 20 |
| 21 | 22 | 23 | 24 | 25 | 26 | 27 |
| 28 | 29 | 30 | 31 | |||
本文介绍一个简单网络爬虫(spider)的实现原理;配合 Lucene 建立索引后,即可实现类似 Google 的简单搜索功能。
1 定义接口
import java.net.URL;
/**
 * Callback contract through which a spider reports what it encounters while
 * crawling: discovered links, URLs that failed, and e-mail links.
 */
public interface ISpiderReportable {

    /**
     * Reports a link that was discovered on a page.
     *
     * @param base
     *            the page on which the link was found
     * @param url
     *            the link itself, already resolved against {@code base}
     * @return {@code true} if the spider should queue {@code url} for
     *         processing, {@code false} if it should be ignored
     */
    boolean spiderFoundURL(URL base, URL url);

    /**
     * Reports a URL whose contents could not be retrieved.
     *
     * @param url
     *            the URL that resulted in an error
     */
    void spiderURLError(URL url);

    /**
     * Reports an e-mail link that was discovered on a page.
     *
     * @param email
     *            the link text as found in the page
     */
    void spiderFoundEMail(String email);
}
2 HTML 解析
import javax.swing.text.html.HTMLEditorKit;
/**
 * Thin {@link HTMLEditorKit} subclass that exists only to hand out the kit's
 * HTML parser through a public method.
 */
public class HTMLParse extends HTMLEditorKit {

    /**
     * Returns the parser supplied by the superclass, exposed here with public
     * visibility so that code outside the editor kit can obtain it.
     *
     * @return the parser provided by {@link HTMLEditorKit}
     */
    public HTMLEditorKit.Parser getParser() {
        HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}
3 spider
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import net.sf.hibernate.id.UUIDHexGenerator;
/**
 * A simple web spider. Starting from the URLs placed in the waiting workload,
 * it downloads each page, parses it as HTML, reports every link it finds to an
 * {@link ISpiderReportable}, and queues the links the reporter accepts. Each
 * successfully processed page is also saved to a local file.
 */
public class Spider {
    /**
     * A collection of URLs that resulted in an error
     */
    protected Collection workloadError = new ArrayList(3);

    /**
     * A collection of URLs that are waiting to be processed
     */
    protected Collection workloadWaiting = new ArrayList(3);

    /**
     * A collection of URLs that were processed
     */
    protected Collection workloadProcessed = new ArrayList(3);

    /**
     * The class that the spider should report its URLs to
     */
    protected ISpiderReportable report;

    /**
     * A flag that indicates whether this process should be canceled. Declared
     * volatile because {@link #cancel()} may be invoked from a thread other
     * than the one running {@link #begin()}.
     */
    protected volatile boolean cancel = false;

    /**
     * The constructor
     *
     * @param report
     *            A class that implements the ISpiderReportable interface, that
     *            will receive information that the spider finds.
     */
    public Spider(ISpiderReportable report) {
        this.report = report;
    }

    /**
     * Get the URLs that resulted in an error.
     *
     * @return the live collection of failed URLs (not a copy)
     */
    public Collection getWorkloadError() {
        return workloadError;
    }

    /**
     * Get the URLs that are waiting to be processed. You should add one URL to
     * this collection to begin the spider.
     *
     * @return the live collection of waiting URLs (not a copy)
     */
    public Collection getWorkloadWaiting() {
        return workloadWaiting;
    }

    /**
     * Get the URLs that were processed by this spider.
     *
     * @return the live collection of processed URLs (not a copy)
     */
    public Collection getWorkloadProcessed() {
        return workloadProcessed;
    }

    /**
     * Clear all of the workloads.
     */
    public void clear() {
        getWorkloadError().clear();
        getWorkloadWaiting().clear();
        getWorkloadProcessed().clear();
    }

    /**
     * Set a flag that will cause the begin method to return before it is done.
     */
    public void cancel() {
        cancel = true;
    }

    /**
     * Add a URL for processing. The URL is ignored if it is already present in
     * any of the three workloads.
     *
     * @param url
     *            the URL to queue
     */
    public void addURL(URL url) {
        // NOTE(review): contains() relies on URL.equals, which per the JDK
        // documentation may perform host name resolution; consider keying the
        // workloads on the URL's string form if this becomes a bottleneck.
        if (getWorkloadWaiting().contains(url)) {
            return;
        }
        if (getWorkloadError().contains(url)) {
            return;
        }
        if (getWorkloadProcessed().contains(url)) {
            return;
        }
        getWorkloadWaiting().add(url);
    }

    /**
     * Called internally to process a URL: fetch it, parse it for links, move
     * it from the waiting workload to the processed (or error) workload, and
     * save a copy of it.
     *
     * @param url
     *            The URL to be processed.
     */
    public void processURL(URL url) {
        InputStream is = null;
        try {
            // get the URL's contents
            URLConnection connection = url.openConnection();
            String contentType = connection.getContentType();
            if ((contentType != null)
                    && !contentType.toLowerCase().startsWith("text/")) {
                // Non-text content is counted as processed but never parsed.
                getWorkloadWaiting().remove(url);
                getWorkloadProcessed().add(url);
                log("Not processing because content type is: " + contentType);
                return;
            }
            // read the URL
            is = connection.getInputStream();
            Reader r = new InputStreamReader(is);
            // parse the URL
            HTMLEditorKit.Parser parse = new HTMLParse().getParser();
            parse.parse(r, new Parser(url), true);
        } catch (IOException e) {
            getWorkloadWaiting().remove(url);
            getWorkloadError().add(url);
            report.spiderURLError(url);
            return;
        } finally {
            // Always release the network stream, on success and on failure.
            closeQuietly(is);
        }
        // mark URL as complete
        getWorkloadWaiting().remove(url);
        getWorkloadProcessed().add(url);
        log("Complete: " + url);
        saveURL(url);
    }

    /**
     * Called to start the spider. Repeatedly takes a snapshot of the waiting
     * workload and processes every URL in it, until the workload is empty or
     * {@link #cancel()} has been called.
     */
    public void begin() {
        cancel = false;
        while (!getWorkloadWaiting().isEmpty() && !cancel) {
            // Work on a snapshot: processURL mutates the waiting workload.
            Object list[] = getWorkloadWaiting().toArray();
            for (int i = 0; (i < list.length) && !cancel; i++) {
                processURL((URL) list[i]);
            }
        }
    }

    /**
     * A HTML parser callback used by this class to detect links
     *
     * @author Jeff Heaton
     * @version 1.0
     */
    protected class Parser extends HTMLEditorKit.ParserCallback {
        /** The page being parsed; relative links are resolved against it. */
        protected URL base;

        public Parser(URL base) {
            this.base = base;
        }

        /**
         * Inspects a tag for a link. Uses the HREF attribute, or SRC for
         * FRAME tags without HREF; strips any fragment; reports mailto links
         * as e-mail and hands everything else to {@link #handleLink}.
         */
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String href = (String) a.getAttribute(HTML.Attribute.HREF);
            if ((href == null) && (t == HTML.Tag.FRAME)) {
                href = (String) a.getAttribute(HTML.Attribute.SRC);
            }
            if (href == null) {
                return;
            }
            int i = href.indexOf('#');
            if (i != -1) {
                href = href.substring(0, i);
            }
            if (href.toLowerCase().startsWith("mailto:")) {
                report.spiderFoundEMail(href);
                return;
            }
            handleLink(base, href);
        }

        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            handleSimpleTag(t, a, pos); // handle the same way
        }

        /**
         * Resolves a link against its page, reports it, and queues it when
         * the reporter accepts it. Malformed links are skipped.
         */
        protected void handleLink(URL base, String str) {
            try {
                URL url = new URL(base, str);
                if (report.spiderFoundURL(base, url)) {
                    addURL(url);
                }
            } catch (MalformedURLException ignored) {
                // A link that cannot be parsed as a URL is deliberately skipped.
            }
        }
    }

    /**
     * Called internally to log information This basic method just writes the
     * log out to the stdout.
     *
     * @param entry
     *            The information to be written to the log.
     */
    public void log(String entry) {
        System.out.println((new Date()) + ":" + entry);
    }

    /**
     * Downloads the given URL again and writes its bytes to a local file whose
     * name contains a freshly generated identifier. I/O failures are printed
     * and otherwise ignored.
     *
     * @param url
     *            the URL whose content is saved
     */
    public void saveURL(URL url) {
        InputStream in = null;
        FileOutputStream fout = null;
        try {
            // Plain URLConnection is sufficient here and, unlike a cast to
            // HttpURLConnection, does not fail for non-HTTP URLs.
            URLConnection urlcon = url.openConnection();
            urlcon.connect();
            in = urlcon.getInputStream();
            BufferedInputStream bis = new BufferedInputStream(in);
            String id = (String) new UUIDHexGenerator().generate(null, null);
            System.out.println(url + "--------------" + id);
            // NOTE(review): this literal looks like a Windows path whose
            // backslash escapes were lost; confirm the intended directory.
            fout = new FileOutputStream(new File(
                    "E: empsearch" + id + ".htm"));
            System.out.println(urlcon.getContentEncoding());
            byte[] buf = new byte[128];
            int k = 0;
            while ((k = bis.read(buf)) != -1) {
                fout.write(buf, 0, k);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release both ends regardless of how the copy finished.
            closeQuietly(fout);
            closeQuietly(in);
        }
    }

    /** Closes an input stream if it was opened, ignoring close failures. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if closing fails.
        }
    }

    /** Closes a file output stream if it was opened, reporting close failures. */
    private static void closeQuietly(FileOutputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
4 实现接口
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
/**
 * Command-line link checker. Runs a {@link Spider} on a background thread,
 * starting from a base URL, and counts the good and bad links the spider
 * reports. Only links on the same host as the page they were found on are
 * followed.
 */
public class CheckLinks implements Runnable,
        ISpiderReportable {
    /**
     * The background spider thread; null while no crawl is running. Volatile
     * because it is written by both the caller and the background thread.
     */
    protected volatile Thread backgroundThread;

    /**
     * The spider object being used. Volatile because it is assigned on the
     * background thread and read by {@link #begin()} on the caller's thread.
     */
    protected volatile Spider spider;

    /**
     * The URL that the spider began with
     */
    protected URL base;

    /**
     * How many bad links have been found
     */
    protected int badLinksCount = 0;

    /**
     * How many good links have been found
     */
    protected int goodLinksCount = 0;

    /**
     * The constructor.
     *
     * @param url
     *            the URL the crawl starts from
     */
    public CheckLinks(URL url) {
        this.base = url;
    }

    /**
     * Main method for the application
     *
     * @param args
     *            args[0] is the URL to start checking from
     */
    static public void main(String args[]) {
        if (args == null || args.length < 1) {
            // Fail with a usage hint instead of an ArrayIndexOutOfBoundsException.
            System.err.println("Usage: java CheckLinks <start-url>");
            return;
        }
        try {
            new CheckLinks(new URL(args[0])).begin();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Starts a crawl on a new background thread if none is running;
     * otherwise asks the running spider to cancel.
     */
    void begin() {
        if (backgroundThread == null) {
            // Reset the counters before the worker starts so that links it
            // counts immediately are not wiped out afterwards.
            goodLinksCount = 0;
            badLinksCount = 0;
            Thread worker = new Thread(this);
            backgroundThread = worker;
            worker.start();
        } else {
            // The worker may not have created its spider yet.
            Spider running = spider;
            if (running != null) {
                running.cancel();
            }
        }
    }

    /**
     * Perform the background thread operation: create the spider, seed it
     * with the base URL, and crawl until it finishes or is canceled.
     */
    public void run() {
        try {
            Spider crawler = new Spider(this);
            spider = crawler;
            crawler.clear();
            crawler.addURL(base);
            crawler.begin();
        } finally {
            // Always mark the crawl as finished, even if it failed, so that
            // begin() can start a new one.
            backgroundThread = null;
        }
    }

    /**
     * Called by the spider when a URL is found. It is here that links are
     * validated and counted.
     *
     * @param base
     *            The page that the link was found on.
     * @param url
     *            The actual link address.
     * @return true if the link is good and on the same host as the page it
     *         was found on, meaning the spider should follow it
     */
    public boolean spiderFoundURL(URL base, URL url) {
        if (!checkLink(url)) {
            badLinksCount++;
            return false;
        }
        goodLinksCount++;
        // Only follow links that stay on the same host.
        return url.getHost().equalsIgnoreCase(base.getHost());
    }

    /**
     * Called when a URL error is found. This implementation does nothing.
     *
     * @param url
     *            The URL that resulted in an error.
     */
    public void spiderURLError(URL url) {
    }

    /**
     * Called internally to check whether a link is good. A link is good when
     * a connection can be opened and, for HTTP connections, the server
     * answers with a status code below 400.
     *
     * @param url
     *            The link that is being checked.
     * @return True if the link was good, false otherwise.
     */
    protected boolean checkLink(URL url) {
        try {
            URLConnection connection = url.openConnection();
            connection.connect();
            if (connection instanceof java.net.HttpURLConnection) {
                java.net.HttpURLConnection http = (java.net.HttpURLConnection) connection;
                try {
                    // A successful connect alone does not prove the link is
                    // valid: the server may still answer 404 or 500.
                    int status = http.getResponseCode();
                    return status != -1
                            && status < java.net.HttpURLConnection.HTTP_BAD_REQUEST;
                } finally {
                    http.disconnect();
                }
            }
            return true;
        } catch (IOException e) {
            return false;
        }
    }

    /**
     * Called when the spider finds an e-mail address. This implementation
     * does nothing.
     *
     * @param email
     *            The email address the spider found.
     */
    public void spiderFoundEMail(String email) {
    }
}















