blaze

欢迎来到blaze>>   | 首页 资源中心 | solaris | Ruby on rails | ajax | oracle | JCOM | tapestry | WorkFlow | 我的项目 | JSF | 乱七八糟 | spring | opensource | struts | hibernate | eclipse | 灵光乍现 | ITPUB论坛

搜索引擎简版

发表人:fourfire | 发表时间: 2005年三月09日, 11:18

本文演示spider(网络爬虫)的简单实现原理,配合lucene即可实现一个简易的类似google的搜索功能

1 定义接口

import java.net.URL;

/**
 * Callback contract through which a spider reports what it encounters.
 *
 * @author Administrator
 */
public interface ISpiderReportable {

    /**
     * Reports a link that was found on a page.
     *
     * @param base
     *            the URL of the page on which the link was found
     * @param url
     *            the link that was found
     * @return a flag for the spider; its meaning is decided by the spider
     *         that invokes this callback
     */
    boolean spiderFoundURL(URL base, URL url);

    /**
     * Reports a URL that resulted in an error.
     *
     * @param url
     *            the URL that failed
     */
    void spiderURLError(URL url);

    /**
     * Reports an e-mail address that was found.
     *
     * @param email
     *            the e-mail address as found by the spider
     */
    void spiderFoundEMail(String email);
}

2 html解析

import javax.swing.text.html.HTMLEditorKit;

/**
 * Thin {@link HTMLEditorKit} subclass whose only purpose is to expose the
 * kit's HTML parser: {@code getParser()} is redeclared here as
 * {@code public} so code outside the Swing package can obtain it.
 *
 * @author Administrator
 */
public class HTMLParse extends HTMLEditorKit {

    /**
     * Returns the parser supplied by the superclass, unchanged.
     *
     * @return the parser obtained from {@code HTMLEditorKit.getParser()}
     */
    public HTMLEditorKit.Parser getParser() {
        final HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}

3 spider

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;

import net.sf.hibernate.id.UUIDHexGenerator;

/**
 * @author Administrator
 *
 * TODO To change the template for this generated type comment go to Window -
 * Preferences - Java - Code Style - Code Templates
 */
public class Spider {

 /**
  * A collection of URLs that resulted in an error
  */
 protected Collection workloadError = new ArrayList(3);

 /**
  * A collection of URLs that are waiting to be processed
  */
 protected Collection workloadWaiting = new ArrayList(3);

 /**
  * A collection of URLs that were processed
  */
 protected Collection workloadProcessed = new ArrayList(3);

 /**
  * The class that the spider should report its URLs to
  */
 protected ISpiderReportable report;

 /**
  * A flag that indicates whether this process should be canceled
  */
 protected boolean cancel = false;

 /**
  * The constructor
  *
  * @param report
  *            A class that implements the ISpiderReportable interface, that
  *            will receive information that the spider finds.
  */
 public Spider(ISpiderReportable report) {
  this.report = report;
 }

 /**
  * Get the URLs that resulted in an error.
  *
  * @return A collection of URL's.
  */
 public Collection getWorkloadError() {
  return workloadError;
 }

 /**
  * Get the URLs that were waiting to be processed. You should add one URL to
  * this collection to begin the spider.
  *
  * @return A collection of URLs.
  */
 public Collection getWorkloadWaiting() {
  return workloadWaiting;
 }

 /**
  * Get the URLs that were processed by this spider.
  *
  * @return A collection of URLs.
  */
 public Collection getWorkloadProcessed() {
  return workloadProcessed;
 }

 /**
  * Clear all of the workloads.
  */
 public void clear() {
  getWorkloadError().clear();
  getWorkloadWaiting().clear();
  getWorkloadProcessed().clear();
 }

 /**
  * Set a flag that will cause the begin method to return before it is done.
  */
 public void cancel() {
  cancel = true;
 }

 /**
  * Add a URL for processing.
  *
  * @param url
  */
 public void addURL(URL url) {
  if (getWorkloadWaiting().contains(url))
   return;
  if (getWorkloadError().contains(url))
   return;
  if (getWorkloadProcessed().contains(url))
   return;
  // log("Adding to workload: " + url );
  getWorkloadWaiting().add(url);
 }

 /**
  * Called internally to process a URL
  *
  * @param url
  *            The URL to be processed.
  */
 public void processURL(URL url) {
  try {
   //log("Processing: " + url);
   // get the URL's contents
   URLConnection connection = url.openConnection();
   if ((connection.getContentType() != null)
     && !connection.getContentType().toLowerCase().startsWith(
       "text/")) {
    getWorkloadWaiting().remove(url);
    getWorkloadProcessed().add(url);
    log("Not processing because content type is: "
      + connection.getContentType());
    return;
   }

   // read the URL
   InputStream is = connection.getInputStream();
   Reader r = new InputStreamReader(is);
   // parse the URL
   HTMLEditorKit.Parser parse = new HTMLParse().getParser();
   parse.parse(r, new Parser(url), true);
  } catch (IOException e) {
   getWorkloadWaiting().remove(url);
   getWorkloadError().add(url);
   //log("Error: " + url);
   report.spiderURLError(url);
   return;
  }
  // mark URL as complete
  getWorkloadWaiting().remove(url);
  getWorkloadProcessed().add(url);
  log("Complete: " + url);
  saveURL(url);
 }

 /**
  * Called to start the spider
  */
 public void begin() {
  cancel = false;
  while (!getWorkloadWaiting().isEmpty() && !cancel) {
   Object list[] = getWorkloadWaiting().toArray();
   for (int i = 0; (i < list.length) && !cancel; i++)
    processURL((URL) list[i]);
  }
 }

 /**
  * A HTML parser callback used by this class to detect links
  *
  * @author Jeff Heaton
  * @version 1.0
  */
 protected class Parser extends HTMLEditorKit.ParserCallback {
  protected URL base;

  public Parser(URL base) {
   this.base = base;
  }

  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
   String href = (String) a.getAttribute(HTML.Attribute.HREF);

   if ((href == null) && (t == HTML.Tag.FRAME))
    href = (String) a.getAttribute(HTML.Attribute.SRC);

   if (href == null)
    return;

   int i = href.indexOf('#');
   if (i != -1)
    href = href.substring(0, i);

   if (href.toLowerCase().startsWith("mailto:")) {
    report.spiderFoundEMail(href);
    return;
   }

   handleLink(base, href);
  }

  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
   handleSimpleTag(t, a, pos); // handle the same way

  }

  protected void handleLink(URL base, String str) {
   try {
    URL url = new URL(base, str);
    if (report.spiderFoundURL(base, url))
     addURL(url);
   } catch (MalformedURLException e) {
    //log("Found malformed URL: " + str );
   }
  }
 }

 /**
  * Called internally to log information This basic method just writes the
  * log out to the stdout.
  *
  * @param entry
  *            The information to be written to the log.
  */
 public void log(String entry) {
  System.out.println((new Date()) + ":" + entry);
 }

 public void saveURL(URL url) {
  HttpURLConnection urlcon;
  try {
   urlcon = (HttpURLConnection) url.openConnection();
   urlcon.connect();
   InputStream in = urlcon.getInputStream();
   BufferedInputStream bis = new BufferedInputStream(in);
   String id=(String)new UUIDHexGenerator().generate(null,null);
   System.out.println(url+"--------------"+id);
   FileOutputStream fout = new FileOutputStream(new File(
     "E: empsearch"+id+".htm"));
   System.out.println(urlcon.getContentEncoding());
   byte[] buf = new byte[128];
   int k = 0;
   while ((k = bis.read(buf)) != -1) {

    fout.write(buf, 0, k);
   }
   in.close();
  } catch (IOException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  }

 }
}

4 实现接口

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * @author Administrator
 *
 * TODO To change the template for this generated type comment go to Window -
 * Preferences - Java - Code Style - Code Templates
 */
public class CheckLinks  implements Runnable,
  ISpiderReportable {

 /**
  * The constructor. Perform setup here.
  */
 public CheckLinks(URL url) {
  this.base=url;
 }

 /**
  * Main method for the application
  *
  * @param args
  *            Not used
  */
 static public void main(String args[]) {
  try {
   new CheckLinks(new URL(args[0])).begin();
  } catch (MalformedURLException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  }
 }

 
 //{{DECLARE_MENUS
 //}}

 /**
  * The background spider thread
  */
 protected Thread backgroundThread;

 /**
  * The spider object being used
  */
 protected Spider spider;

 /**
  * The URL that the spider began with
  */
 protected URL base;

 /**
  * How many bad links have been found
  */
 protected int badLinksCount = 0;

 /**
  * How many good links have been found
  */
 protected int goodLinksCount = 0;

 

 /**
  * Called when the begin or cancel buttons are clicked
  *
  * @param event
  *            The event associated with the button.
  */
 void begin() {
  if (backgroundThread == null) {
   
   backgroundThread = new Thread(this);
   backgroundThread.start();
   goodLinksCount = 0;
   badLinksCount = 0;
  } else {
   spider.cancel();
  }

 }

 /**
  * Perform the background thread operation. This method actually starts the
  * background thread.
  */
 public void run() {
  spider = new Spider(this);
  spider.clear();
  
  spider.addURL(base);
  spider.begin();
  
  backgroundThread = null;
 }

 /**
  * Called by the spider when a URL is found. It is here that links are
  * validated.
  *
  * @param base
  *            The page that the link was found on.
  * @param url
  *            The actual link address.
  */
 public boolean spiderFoundURL(URL base, URL url) {
  

  if (!checkLink(url)) {
   
   badLinksCount++;
   return false;
  }

  goodLinksCount++;
  if (!url.getHost().equalsIgnoreCase(base.getHost()))
   return false;
  else
   return true;
 }

 /**
  * Called when a URL error is found
  *
  * @param url
  *            The URL that resulted in an error.
  */
 public void spiderURLError(URL url) {
 }

 /**
  * Called internally to check whether a link is good
  *
  * @param url
  *            The link that is being checked.
  * @return True if the link was good, false otherwise.
  */
 protected boolean checkLink(URL url) {
  try {
   URLConnection connection = url.openConnection();
   connection.connect();
   return true;
  } catch (IOException e) {
   return false;
  }
 }

 /**
  * Called when the spider finds an e-mail address
  *
  * @param email
  *            The email address the spider found.
  */
 public void spiderFoundEMail(String email) {
 }


 
 
}

发表评论

标题

在此添加评论

称呼

邮箱地址(可选)

个人主页(可选)




Valid XHTML 1.0 Strict and CSS. Powered by pLog
Design by Blog.lvwo.com