diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/build.xml nutch-changed/build.xml --- nutch/build.xml 2004-08-31 13:52:03.000000000 -0400 +++ nutch-changed/build.xml 2004-09-04 15:13:44.000000000 -0400 @@ -156,6 +156,7 @@ + diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/conf/nutch-default.xml nutch-changed/conf/nutch-default.xml --- nutch/conf/nutch-default.xml 2004-09-02 18:41:15.000000000 -0400 +++ nutch-changed/conf/nutch-default.xml 2004-09-04 14:44:50.000000000 -0400 @@ -410,6 +410,19 @@ expressions used by RegexURLFilter. + + + + urlnormalizer.class + net.nutch.net.BasicUrlNormalizer + Name of the class used to normalize URLs. + + + + urlnormalizer.regex.file + regex-normalize.xml + Name of the config file used by the RegexUrlNormalizer class. + diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/conf/regex-normalize.xml.template nutch-changed/conf/regex-normalize.xml.template --- nutch/conf/regex-normalize.xml.template 1969-12-31 19:00:00.000000000 -0500 +++ nutch-changed/conf/regex-normalize.xml.template 2004-09-04 15:17:42.000000000 -0400 @@ -0,0 +1,22 @@ + + + + + + + + + (\?|\&|\&)PHPSESSID=[a-zA-Z0-9]{32}$ + + + + (\?|\&|\&)PHPSESSID=[a-zA-Z0-9]{32}(\&|\&)(.*) + $1$3 + + diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/db/Link.java nutch-changed/src/java/net/nutch/db/Link.java --- nutch/src/java/net/nutch/db/Link.java 2003-05-21 12:25:10.000000000 -0400 +++ nutch-changed/src/java/net/nutch/db/Link.java 2004-09-04 14:45:46.000000000 -0400 @@ -9,7 +9,7 @@ import net.nutch.io.*; import net.nutch.util.*; -import net.nutch.net.UrlNormalizer; 
+import net.nutch.net.UrlNormalizerFactory; /********************************************* * This is the field in the Link Database. @@ -56,7 +56,7 @@ public Link(MD5Hash fromID, long domainID, String urlString, String anchorText) throws MalformedURLException { this.fromID = fromID; - this.url = new UTF8(UrlNormalizer.normalize(urlString)); + this.url = new UTF8(UrlNormalizerFactory.getNormalizer().normalize(urlString)); this.domainID = domainID; // truncate long anchors diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/db/Page.java nutch-changed/src/java/net/nutch/db/Page.java --- nutch/src/java/net/nutch/db/Page.java 2003-05-21 12:25:10.000000000 -0400 +++ nutch-changed/src/java/net/nutch/db/Page.java 2004-09-04 14:46:05.000000000 -0400 @@ -9,7 +9,7 @@ import net.nutch.io.*; import net.nutch.util.*; -import net.nutch.net.UrlNormalizer; +import net.nutch.net.UrlNormalizerFactory; /********************************************* * A row in the Page Database. @@ -181,7 +181,7 @@ // public UTF8 getURL() { return url; } public void setURL(String url) throws MalformedURLException { - this.url = new UTF8(UrlNormalizer.normalize(url)); + this.url = new UTF8(UrlNormalizerFactory.getNormalizer().normalize(url)); } public MD5Hash getMD5() { return md5; } diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/net/BasicUrlNormalizer.java nutch-changed/src/java/net/nutch/net/BasicUrlNormalizer.java --- nutch/src/java/net/nutch/net/BasicUrlNormalizer.java 1969-12-31 19:00:00.000000000 -0500 +++ nutch-changed/src/java/net/nutch/net/BasicUrlNormalizer.java 2004-09-04 14:48:58.000000000 -0400 @@ -0,0 +1,70 @@ +/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. 
*/ + +package net.nutch.net; + +import java.net.URL; +import java.net.MalformedURLException; +// import java.net.URI; +// import java.net.URISyntaxException; + +import java.util.logging.Logger; +import net.nutch.util.LogFormatter; + +/** Converts URLs to a normal form: surrounding whitespace is trimmed and, for http and ftp URLs, the host is lowercased, an explicit default port is dropped, an empty path becomes "/" and the ref (fragment) is removed. */ +public class BasicUrlNormalizer implements UrlNormalizer { + public static final Logger LOG = + LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer"); + /** Returns the normal form of urlString. The empty string is returned as is, and the string is only rebuilt when some component actually changed. Throws MalformedURLException if java.net.URL rejects the trimmed input. */ + public String normalize(String urlString) + throws MalformedURLException { + if ("".equals(urlString)) // permit empty + return urlString; + + urlString = urlString.trim(); // remove extra spaces + + URL url = new URL(urlString); + + String protocol = url.getProtocol(); + String host = url.getHost(); + int port = url.getPort(); + String file = url.getFile(); + + boolean changed = false; + + if (!urlString.startsWith(protocol)) // protocol was lowercased + changed = true; + + if ("http".equals(protocol) || "ftp".equals(protocol)) { + + if (host != null) { + String newHost = host.toLowerCase(); // lowercase host + if (!host.equals(newHost)) { + host = newHost; + changed = true; + } + } + + if (port == url.getDefaultPort()) { // uses default port + port = -1; // so don't specify it + changed = true; + } + + if (file == null || "".equals(file)) { // add a slash + file = "/"; + changed = true; + } + + if (url.getRef() != null) { // remove the ref + changed = true; + } + + } + + if (changed) + urlString = new URL(protocol, host, port, file).toString(); + + return urlString; + } + +} diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/net/RegexUrlNormalizer.java nutch-changed/src/java/net/nutch/net/RegexUrlNormalizer.java --- nutch/src/java/net/nutch/net/RegexUrlNormalizer.java 1969-12-31 19:00:00.000000000 -0500 +++ nutch-changed/src/java/net/nutch/net/RegexUrlNormalizer.java 2004-09-04 14:48:58.000000000 -0400 @@ -0,0 +1,155 @@ 
+/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.net; + +import java.net.URL; +import java.net.MalformedURLException; +import java.io.IOException; +// import java.net.URI; +// import java.net.URISyntaxException; + +import java.util.List; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.logging.Logger; +import net.nutch.util.LogFormatter; + +import javax.xml.parsers.*; +import org.w3c.dom.*; +import org.apache.oro.text.regex.*; + +import net.nutch.util.*; + +/** Allows users to do regex substitutions on all/any URLs that are encountered, which + * is useful for stripping session IDs from URLs. + * + *

This class must be specified as the URL normalizer to be used in nutch-site.xml + * or nutch-default.xml. To do this specify the urlnormalizer.class property to + * have the value: net.nutch.net.RegexUrlNormalizer. The urlnormalizer.regex.file + * property should also be set to the file name of an xml file which should contain the patterns + * and substitutions to be done on encountered URLs.

+ * + * @author Luke Baker + */ +public class RegexUrlNormalizer extends BasicUrlNormalizer + implements UrlNormalizer { + + /** Class which holds a compiled pattern and its corresponding substitution string. */ + private static class Rule { + public Perl5Pattern pattern; + public String substitution; + } + + private List rules; + private PatternMatcher matcher = new Perl5Matcher(); + + /** Default constructor which gets the file name from either nutch-site.xml + * or nutch-default.xml and reads that configuration file. It stores the regex patterns + * and corresponding substitutions in a List. The file should be in the CLASSPATH. */ + public RegexUrlNormalizer() throws IOException, MalformedPatternException { + String filename = NutchConf.get("urlnormalizer.regex.file"); + URL url= NutchConf.class.getClassLoader().getResource(filename); + + rules=readConfigurationFile(url.toString()); + } + + /** Constructor which can be passed the file name, so it doesn't look in the configuration files for it. */ + public RegexUrlNormalizer(String filename) + throws IOException, MalformedPatternException { + //URL url= NutchConf.class.getClassLoader().getResource(filename); + rules = readConfigurationFile(filename); + } + + + /** This function does the replacements by iterating through all the regex patterns. + * It accepts a string url as input and returns the altered string. */ + public synchronized String regexNormalize(String urlString) { + Iterator i=rules.iterator(); + while(i.hasNext()) { + Rule r=(Rule) i.next(); + urlString = Util.substitute(matcher, r.pattern, + new Perl5Substitution(r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual substitution + } + return urlString; + } + + /** Normalizes any URLs by calling super.normalize() + * and regexNormalize(). This is the function that gets called + * elsewhere in Nutch. 
*/ + public synchronized String normalize(String urlString) + throws MalformedURLException { + urlString = super.normalize(urlString); // run the basic normalization first to ready for regexNormalize + urlString = regexNormalize(urlString); + urlString = super.normalize(urlString); // make sure regexNormalize didn't screw up the URL + return urlString; + } + + + + /** Reads the configuration file and populates a List of Rules. Each regex element contributes one rule when it yields both a pattern and a substitution; a child element without child nodes sets the substitution to the empty string. Any exception raised while parsing is logged as severe and not rethrown, so the rules collected up to that point are returned. */ + private static List readConfigurationFile(String filename) + throws IOException, MalformedPatternException { + + Perl5Compiler compiler=new Perl5Compiler(); + List rules=new ArrayList(); + try { + + LOG.info("loading " + filename); + // borrowed heavily from code in NutchConf.java + Document doc = + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .parse(filename); + Element root = doc.getDocumentElement(); + if (!"regex-normalize".equals(root.getTagName())) + LOG.severe("bad conf file: top-level element not "); + NodeList regexes = root.getChildNodes(); + for (int i = 0; i < regexes.getLength(); i++) { + Node regexNode = regexes.item(i); + if (!(regexNode instanceof Element)) + continue; + Element regex = (Element)regexNode; + if (!"regex".equals(regex.getTagName())) + LOG.warning("bad conf file: element not "); + NodeList fields = regex.getChildNodes(); + String patternValue = null; + String subValue = null; + for (int j = 0; j < fields.getLength(); j++) { + Node fieldNode = fields.item(j); + if (!(fieldNode instanceof Element)) + continue; + Element field = (Element)fieldNode; + if ("pattern".equals(field.getTagName()) && field.hasChildNodes()) + patternValue = ((Text)field.getFirstChild()).getData(); + if ("substitution".equals(field.getTagName()) && field.hasChildNodes()) + subValue = ((Text)field.getFirstChild()).getData(); + if (!field.hasChildNodes()) + subValue = ""; + } + if (patternValue != null && subValue != null) { + Rule rule=new Rule(); + rule.pattern=(Perl5Pattern) compiler.compile(patternValue); + 
rule.substitution=subValue; + rules.add(rule); + } + } + + } catch (Exception e) { + LOG.severe("error parsing " + filename +" conf file: " + e); + } + return rules; + } + + /** Spits out patterns and substitutions that are in the configuration file. */ + public static void main(String args[]) + throws MalformedPatternException, IOException { + RegexUrlNormalizer normalizer = new RegexUrlNormalizer(); + Iterator i=normalizer.rules.iterator(); + while(i.hasNext()) { + Rule r=(Rule) i.next(); + System.out.print(r.pattern.getPattern() + " "); + System.out.println(r.substitution); + } + } + +} diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/net/UrlNormalizerFactory.java nutch-changed/src/java/net/nutch/net/UrlNormalizerFactory.java --- nutch/src/java/net/nutch/net/UrlNormalizerFactory.java 1969-12-31 19:00:00.000000000 -0500 +++ nutch-changed/src/java/net/nutch/net/UrlNormalizerFactory.java 2004-09-04 14:48:58.000000000 -0400 @@ -0,0 +1,38 @@ +/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.net; + +import net.nutch.util.*; +import java.util.logging.*; + +/** Factory to create a UrlNormalizer from "urlnormalizer.class" config property. */ +public class UrlNormalizerFactory { + private static final Logger LOG = + LogFormatter.getLogger("net.nutch.net.UrlNormalizerFactory"); + + private static final String URLNORMALIZER_CLASS = + NutchConf.get("urlnormalizer.class"); + + private UrlNormalizerFactory() {} // no public ctor + + private static UrlNormalizer normalizer; + + /** Return the default UrlNormalizer implementation: the class named by the "urlnormalizer.class" property is instantiated on the first call and the same instance is returned afterwards; a RuntimeException is thrown if it cannot be created. 
*/ + public static UrlNormalizer getNormalizer() { + /* created lazily on first use, then kept in the static field */ + if (normalizer == null) { + try { + LOG.info("Using URL normalizer: " + URLNORMALIZER_CLASS); + Class normalizerClass = Class.forName(URLNORMALIZER_CLASS); + normalizer = (UrlNormalizer)normalizerClass.newInstance(); + } catch (Exception e) { + throw new RuntimeException("Couldn't create "+URLNORMALIZER_CLASS, e); + } + } + /* NOTE(review): the null check above is not synchronized */ + return normalizer; + + } + +} diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/net/UrlNormalizer.java nutch-changed/src/java/net/nutch/net/UrlNormalizer.java --- nutch/src/java/net/nutch/net/UrlNormalizer.java 2004-04-23 15:32:33.000000000 -0400 +++ nutch-changed/src/java/net/nutch/net/UrlNormalizer.java 2004-09-04 14:48:58.000000000 -0400 @@ -3,69 +3,12 @@ package net.nutch.net; -import java.net.URL; import java.net.MalformedURLException; -// import java.net.URI; -// import java.net.URISyntaxException; -import java.util.logging.Logger; -import net.nutch.util.LogFormatter; - -/** Converts URLs to a normal form . 
*/ -public class UrlNormalizer { - public static final Logger LOG = - LogFormatter.getLogger("net.nutch.net.UrlNormalizer"); - - public static String normalize(String urlString) - throws MalformedURLException { - - if ("".equals(urlString)) // permit empty - return urlString; - - urlString = urlString.trim(); // remove extra spaces - - URL url = new URL(urlString); - - String protocol = url.getProtocol(); - String host = url.getHost(); - int port = url.getPort(); - String file = url.getFile(); - - boolean changed = false; - - if (!urlString.startsWith(protocol)) // protocol was lowercased - changed = true; - - if ("http".equals(protocol) || "ftp".equals(protocol)) { - - if (host != null) { - String newHost = host.toLowerCase(); // lowercase host - if (!host.equals(newHost)) { - host = newHost; - changed = true; - } - } - - if (port == url.getDefaultPort()) { // uses default port - port = -1; // so don't specify it - changed = true; - } - - if (file == null || "".equals(file)) { // add a slash - file = "/"; - changed = true; - } - - if (url.getRef() != null) { // remove the ref - changed = true; - } - - } - - if (changed) - urlString = new URL(protocol, host, port, file).toString(); - - return urlString; - } +/** Interface used to convert URLs to normal form and optionally do regex substitutions */ +public interface UrlNormalizer { + + /** Returns the normal form of urlString; implementations may throw MalformedURLException when the URL cannot be normalized. */ + public String normalize(String urlString) throws MalformedURLException; } diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/parse/Outlink.java nutch-changed/src/java/net/nutch/parse/Outlink.java --- nutch/src/java/net/nutch/parse/Outlink.java 2004-07-10 16:21:37.000000000 -0400 +++ nutch-changed/src/java/net/nutch/parse/Outlink.java 2004-09-04 14:48:19.000000000 -0400 @@ -7,7 +7,7 @@ import java.net.MalformedURLException; import net.nutch.io.*; -import 
net.nutch.net.UrlNormalizer; +import net.nutch.net.UrlNormalizerFactory; /* An outgoing link from a page. */ public class Outlink implements Writable { @@ -18,7 +18,7 @@ public Outlink() {} public Outlink(String toUrl, String anchor) throws MalformedURLException { - this.toUrl = UrlNormalizer.normalize(toUrl); + this.toUrl = UrlNormalizerFactory.getNormalizer().normalize(toUrl); this.anchor = anchor; } diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/test/net/nutch/net/TestBasicUrlNormalizer.java nutch-changed/src/test/net/nutch/net/TestBasicUrlNormalizer.java --- nutch/src/test/net/nutch/net/TestBasicUrlNormalizer.java 1969-12-31 19:00:00.000000000 -0500 +++ nutch-changed/src/test/net/nutch/net/TestBasicUrlNormalizer.java 2004-09-04 14:49:33.000000000 -0400 @@ -0,0 +1,48 @@ +/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.net; + +import java.net.URL; +import junit.framework.TestCase; + +/** Unit tests for BasicUrlNormalizer. 
*/ +public class TestBasicUrlNormalizer extends TestCase { + public TestBasicUrlNormalizer(String name) { super(name); } + + public void testNormalizer() throws Exception { + // check that leading and trailing spaces are removed + normalizeTest(" http://foo.com/ ", "http://foo.com/"); + + // check that protocol is lower cased + normalizeTest("HTTP://foo.com/", "http://foo.com/"); + + // check that host is lower cased + normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); + normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); + + // check that port number is normalized + normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html"); + normalizeTest("http://foo.com:81/", "http://foo.com:81/"); + + // check that null path is normalized + normalizeTest("http://foo.com", "http://foo.com/"); + + // check that references are removed + normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html"); + +// // check that encoding is normalized +// normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); + } + /* Checks the normalizer returned by UrlNormalizerFactory, i.e. whichever class "urlnormalizer.class" names, which need not be BasicUrlNormalizer. */ + private void normalizeTest(String weird, String normal) throws Exception { + assertEquals(normal, UrlNormalizerFactory.getNormalizer().normalize(weird)); + } + + public static void main(String[] args) throws Exception { + new TestBasicUrlNormalizer("test").testNormalizer(); + } + + + +} diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/test/net/nutch/net/test-regex-normalize.xml nutch-changed/src/test/net/nutch/net/test-regex-normalize.xml --- nutch/src/test/net/nutch/net/test-regex-normalize.xml 1969-12-31 19:00:00.000000000 -0500 +++ nutch-changed/src/test/net/nutch/net/test-regex-normalize.xml 2004-09-04 14:49:33.000000000 -0400 @@ -0,0 +1,22 @@ + + + + + + + + + (\?|\&|\&amp;)PHPSESSID=[a-zA-Z0-9]{32}$ + + + + (\?|\&|\&amp;)PHPSESSID=[a-zA-Z0-9]{32}(\&|\&amp;)(.*) + $1$3 + + diff -Nur 
--exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/test/net/nutch/net/TestRegexUrlNormalizer.java nutch-changed/src/test/net/nutch/net/TestRegexUrlNormalizer.java --- nutch/src/test/net/nutch/net/TestRegexUrlNormalizer.java 1969-12-31 19:00:00.000000000 -0500 +++ nutch-changed/src/test/net/nutch/net/TestRegexUrlNormalizer.java 2004-09-04 14:49:33.000000000 -0400 @@ -0,0 +1,39 @@ +/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.net; + +import java.net.URL; +import junit.framework.TestCase; +import net.nutch.net.RegexUrlNormalizer; + +/** Unit tests for RegexUrlNormalizer, run against the rules in test-regex-normalize.xml located through the "test.src.dir" system property. */ +public class TestRegexUrlNormalizer extends TestBasicUrlNormalizer { + public TestRegexUrlNormalizer(String name) { super(name); } + + public void testNormalizer() throws Exception { + normalizeTest("http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03", + "http://foo.com/foo.php?f=2"); + normalizeTest("http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3", + "http://foo.com/foo.php?f=2&q=3"); + normalizeTest("http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2", + "http://foo.com/foo.php?f=2"); + normalizeTest("http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03", + "http://foo.com/foo.php"); + } + /* Builds a new RegexUrlNormalizer from the test rule file on every call and checks its output for one URL. */ + private void normalizeTest(String weird, String normal) throws Exception { + String testSrcDir = System.getProperty("test.src.dir"); + String path = testSrcDir + "/net/nutch/net/test-regex-normalize.xml"; + RegexUrlNormalizer normalizer = new RegexUrlNormalizer(path); + assertEquals(normal, normalizer.normalize(weird)); + } + /* NOTE(review): TestBasicUrlNormalizer checks the normalizer returned by UrlNormalizerFactory, so the second call in main covers RegexUrlNormalizer only if "urlnormalizer.class" names it. */ + public static void main(String[] args) throws Exception { + new TestRegexUrlNormalizer("test").testNormalizer(); + new TestBasicUrlNormalizer("test").testNormalizer(); // need to make sure it passes 
this test too + } + + + +} diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/test/net/nutch/net/TestUrlNormalizer.java nutch-changed/src/test/net/nutch/net/TestUrlNormalizer.java --- nutch/src/test/net/nutch/net/TestUrlNormalizer.java 2004-04-23 15:32:34.000000000 -0400 +++ nutch-changed/src/test/net/nutch/net/TestUrlNormalizer.java 1969-12-31 19:00:00.000000000 -0500 @@ -1,48 +0,0 @@ -/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ -/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ - -package net.nutch.net; - -import java.net.URL; -import junit.framework.TestCase; - -/** Unit tests for UrlNormalizer. */ -public class TestUrlNormalizer extends TestCase { - public TestUrlNormalizer(String name) { super(name); } - - public void testNormalizer() throws Exception { - // check that leading and trailing spaces are removed - normalizeTest(" http://foo.com/ ", "http://foo.com/"); - - // check that protocol is lower cased - normalizeTest("HTTP://foo.com/", "http://foo.com/"); - - // check that host is lower cased - normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); - normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); - - // check that port number is normalized - normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html"); - normalizeTest("http://foo.com:81/", "http://foo.com:81/"); - - // check that null path is normalized - normalizeTest("http://foo.com", "http://foo.com/"); - - // check that references are removed - normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html"); - -// // check that encoding is normalized -// normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); - } - - private void normalizeTest(String weird, String normal) throws Exception { - assertEquals(normal, UrlNormalizer.normalize(weird)); - } - - public static void 
main(String[] args) throws Exception { - new TestUrlNormalizer("test").testNormalizer(); - } - - - -}