import java.util.*;
import java.io.*;
import org.apache.oro.text.regex.*;

/**
 * This class HTMLParser_RE implements the FileParser and the LinkExtractor 
 * interface for extracting data from HTMLish documents searching for 
 * pre defined patterns in the input text.
 *
 * For now the concept of the patterns to be searched for is
 * combined with the structural knowledge of HTMLish documents, i.e.
 * the source documents start mostly with the following 
 * elements/attributes/data:
 * [BOM] (1)
 * *-[<?xml * attribute="value"?>] (2)
 * *-[<html>] (3)
 * *-*-[<head>] (4)
 * *-*-*-[<meta * attribute="value"/>] (5)
 * *-*-*-[<title>value</title>] (6)
 * *-*-[</head>] (7)
 * *-*-[<body>] (8)
 * *-*-*-[<h[1-6]>value</h[1-6]>] (9)
 * *-*-*-[<p>value</p>] (10)
 * *-*-*-[<a href="value">text</a>] (11)
 *
 * (1) If there is a BOM (Byte Order Mark) check for utf-16
 * (2) If there is an XML declaration, check for encoding-attribute
 * (3) If there is a html element ...
 * (4) If there is a head element ...
 *
 * The implementation uses regular expressions to define the target values.
 * It will be user extensible, i.e. the targets and context may be defined 
 * by the caller via the underlying Map of the IGFile structure.
 * Out of the three open source java regex implementations known to the author 
 * for now <em>org.apache.oro.text.regex</em> is used (by far the <b>fastest</b> and 
 * really feature complete Perl5-compatible regex-library). 
 * It's still possible to plug-in the <em>gnu.regexp</em> package, which was 
 * initially used (also quite feature-complete but much slower than 
 * the above mentioned package). The regular expression syntax 
 * of the third package <em>org.apache.regexp</em> seems not to allow for 
 * using (?:some-grouped-only-stuff), so at the time this package 
 * has not really been tried, although considering the usage 
 * it's similar to the gnu.regexp package.
 * All three packages can be obtained e.g. at jpackage.org.
 *
 * <br>$Id: HTMLParser_RE.java,v 1.19 2002/09/17 08:40:34 sdrees Exp $
 * @see FileParser
 *
 * @author Stefan Drees
 * @version 1.19
 */
public class HTMLParser_RE implements FileParser, LinkExtractor { 

    /**
     * Couples one search target (an IGKey plus a regular expression) with
     * the state collected while searching for it: whether it was found,
     * the last captured value, all captured values (for multi-matches) and
     * whether it is looked for in html/head or html/body.
     *
     * A pattern that fails to compile is reported through the enclosing
     * parser's log (if one is already set) and leaves the item in a state
     * where every match attempt simply reports "not found".
     */
    private class WantedItem {
	/** Key under which a found value is stored */
	private IGKey id;
	/** Log resource name used when the item was found */
	private String success; 
	/** Short name of the target, e.g. "title" */
	private String item;
	/** Last captured value, "Not found." until something was captured */
	private String value;
	/** Source text of the regular expression */
	private String context;
	private boolean found;
	private boolean inHead;
	private boolean inBody;

	// org.apache.oro
	private PatternMatcher matcher;
	private PatternCompiler compiler;
	/** Compiled pattern; null when the source text was malformed */
	private Pattern contextRE;
	private PatternMatcherInput input;
	/** All values captured by matchAll(String) */
	private HashSet values;

	/**
	 * @param what short name of the target
	 * @param identity key the captured value belongs to
	 * @param logmsg log resource name reported on success
	 * @param how Perl5 regular expression, compiled case-insensitively;
	 *        group 1 is the captured value
	 */
	public WantedItem (String what, IGKey identity, String logmsg, String how) {
	    item = what;
	    id = identity;
	    success = logmsg;
	    value = "Not found.";
	    context = how;
	    found = false;
	    inHead = false;
	    inBody = false;
	    values = new HashSet();

	    // org.apache.oro
	    compiler = new Perl5Compiler();
	    matcher  = new Perl5Matcher();
	    
	    try {
		contextRE = compiler.compile(how, Perl5Compiler.CASE_INSENSITIVE_MASK);
	    } catch(MalformedPatternException e) {
		// Keep the item usable: match()/matchAll() treat a null
		// pattern as "never matches".
		contextRE = null;
		// Items are also created from the parser's constructor,
		// i.e. before setLog(IGLog) could have been called.
		if (log != null) {
		    log.addError(900, "RE_ERROR", new Object[] {e.getMessage()});
		    if (LOGLEVEL >= IGLog.FILE)
			log.add(IGLog.FILE, "Giving up.");
		}
	    }
	}

	/**
	 * Searches str for the first occurrence of the pattern.
	 * @return true if the pattern was found, which is also remembered
	 *         as the item's found-state
	 */
	public boolean match(String str) {
	    if (contextRE == null)
		return isFound(false);
	    input   = new PatternMatcherInput(str);
	    return isFound(matcher.contains(input, contextRE));
	}

	/**
	 * Searches str for every occurrence of the pattern and collects
	 * group 1 of each occurrence into the value set.
	 * @return true if at least one occurrence was found
	 */
	public boolean matchAll(String str) {
	    isFound(false);
	    if (contextRE == null)
		return isFound();
	    input   = new PatternMatcherInput(str);
	    while (matcher.contains(input, contextRE)) {
		String oneMatch = matcher.getMatch().group(1);
		values(oneMatch);
		// I know, this is extremely stupid hardcoded
		if (LOGLEVEL >= IGLog.PROGRESS)
		    log.addResource(IGLog.PROGRESS, "EXTRACT_HYPERLINK",
				    new String[]{oneMatch});
		isFound(true);
	    }
	    return isFound();
	}

	/**
	 * Stores group 1 of the matcher's last match as the item's value.
	 * @return the captured value, or null if the item is not marked as
	 *         found or the matcher holds no match
	 */
	public String getMatch() {
	    if (!isFound())
		return null;
	    MatchResult last = matcher.getMatch();
	    if (last == null)
		return null;
	    return value(last.group(1));
	}

	/**
	 * @return offset just behind the matcher's last match, or -1 if the
	 *         matcher holds no match
	 */
	public int getEndIndex() {
	    MatchResult last = matcher.getMatch();
	    if (last == null)
		return -1;
	    return last.endOffset(0);
	}
	public String success() {
	    return success;
	}
	public String getContext() {
	    return context;
	}
	public boolean isFound() {
	    return found;
	}
	public boolean isFound(boolean already) {
	    found = already;
	    return found;
	}
	public boolean inHead() {
	    return inHead;
	}
	public boolean inHead(boolean search) {
	    inHead = search;
	    return inHead;
	}
	public boolean inBody() {
	    return inBody;
	}
	public boolean inBody(boolean search) {
	    inBody = search;
	    return inBody;
	}
	public String name() {
	    return item;
	}
	public String value() {
	    return value;
	}
	/** Sets the value and marks the item as found. */
	public String value(String aValue) {
	    value = aValue;
	    if (!isFound()) 
		isFound(true);
	    return value;
	}
	public void values(String aValue) {
	    values.add(aValue);
	    return;
	}
	public HashSet getValues() {
	    return values;
	}
	public IGKey id() {
	    return id;
	}
	public IGKey id(IGKey identity) {
	    id = identity;
	    return id;
	}
    }

    /** Signature of parser (CVS id string, stored under IGKey.PARSER when wanted) */
    static final String PARSER = "$Id: HTMLParser_RE.java,v 1.19 2002/09/17 08:40:34 sdrees Exp $";

    /** The default character encoding for HTML documents */
    // private final static String defaultEncoding = "UTF-8";
    // private final static String defaultEncoding = "UTF-16";
    private final static String defaultEncoding = "ISO-8859-1";
    /** Charset name the current document is being decoded with */
    private String encoding;
    /** Exception for bad character encodings */
    private class BadEncodingException extends Exception { }

    /** Determines if this parser will search for ... */
    // ... hyperlinks (href targets)
    private boolean wantURLs;
    // ... the file type
    private boolean wantFileType;
    // ... the parser signature
    private boolean wantParser;
    // ... a description
    private boolean wantDescription;
    // ... a title
    private boolean wantTitle;

    /** implementation specific data */
    // number of characters requested per chunk read from the document
    private int firstChunkSize;    
    
    /** The logging object for this module */
    private IGLog log;

    /** The default logging level for this module */
    private final static int LOGLEVEL = 9;

    /** File extensions for HTML files */
    private final static String[] extensions = {"html", "htm", "xhtml", "shtml", 
						"php" , "asp", 
						"phtml" , "php3", "php4"};

    /** Mime types for HTML files */
    private final static String[] mimeTypes = {"text/html", "text/xml"};

    /** HTML file magic signature */
    // the byte sequences "<html" and "<?xml"
    private final static byte[][] magic = {
	{(byte) '<', (byte) 'h', (byte) 't', (byte) 'm', (byte) 'l'},
	{(byte) '<', (byte) '?', (byte) 'x', (byte) 'm', (byte) 'l'}};

    /** HTML headers aren't always at the beginning of the file */
    private final static boolean magicOffset = true;

    /** HTML headers aren't case sensitive */
    private final static boolean magicCase = false;

    /** HTML FileMagic structure */
    private final static FileMagic htmlMagic = new FileMagic(magic, magicOffset,
	    magicCase);

    // Search targets created by setWantedItems(IGKeySet); each stays
    // unassigned unless the corresponding key was wanted (wantedEncoding
    // is always created).
    private WantedItem wantedTitle;
    private WantedItem wantedHeading;
    private WantedItem wantedAuthor;
    private WantedItem wantedDescription;
    private WantedItem wantedPara;
    private WantedItem wantedKeywords;
    private WantedItem wantedEncoding;

    /** All search targets registered by setWantedItems(IGKeySet) */
    private Set wantedItems;

    // Structural targets; these are not part of wantedItems.
    // wantedHead and wantedBody are created in the constructor,
    // wantedURLs is created anew by every doParse run.
    private WantedItem wantedHead;
    private WantedItem wantedBody;
    private WantedItem wantedURLs;

    private String fileType;
    private String fileSizeBytes;    

    /** Location of the document currently being parsed */
    private String fileName;

    /** A document's URL's */
    private HashSet hyperlinks;

    /**
     * Creates a parser that wants nothing yet and has no logger.
     * Contract: the caller uses setLog(IGLog) and setWantedItems(IGKeySet)
     * before calling parse(IGFile).
     */
    public HTMLParser_RE() {
	// no logger, no document, no collected links yet
	log = null;
	fileName = null;
	hyperlinks = null;

	// nothing is wanted until setWantedItems(IGKeySet) says so
	wantTitle = wantDescription = false;
	wantParser = wantFileType = wantURLs = false;

	encoding = defaultEncoding;
	firstChunkSize = 8192; // FIXME sdrees knows why //2048; //512;

	final String headName = "head";
	final String bodyName = "body";

	// closing tag of html/head marks the end of the header data
	wantedHead = new WantedItem(headName, new IGKey(0),
				    "HTML_FOUND_HEAD",
				    "</ ?" + headName + " ?>");

	// opening tag of html/body, attributes allowed
	wantedBody = new WantedItem(bodyName, new IGKey(0),
				    "HTML_FOUND_BODY",
				    "< ?" + bodyName + " ?(?:[^>]*)>");
    }

    /**
     * Set the desired attributes to extract. Rebuilds the set of search
     * targets from scratch; the charset target is always registered
     * because doParse consults it unconditionally.
     *
     * @param wanted A set of bits describing preferences
     */
    public void setWantedItems(IGKeySet wanted) {
	
	wantURLs = wanted.wants(IGKey.URLS);
	wantFileType = wanted.wants(IGKey.FILE_TYPE);
	wantParser = wanted.wants(IGKey.PARSER);
	// FIXME far too much hard coded boolean scalars ...
	wantTitle = wanted.wants(IGKey.TITLE);
	wantDescription = wanted.wants(IGKey.DESCRIPTION);


	hyperlinks = (wantURLs ? new HashSet() : null);

	wantedItems = new HashSet(7);

	if (wanted.wants(IGKey.AUTHOR)) {
	    // find author from '<meta name="author" content="A. Nonymous">'
	    // ... and from '<meta http-equiv="author" content="A. Nonymous">'
	    String author = "author";
	    wantedAuthor = new WantedItem( author, IGKey.AUTHOR, 
					   "FP_FOUND_AUTHOR", 
					   metaContentContext(author));
	    wantedAuthor.inHead(true);
	    wantedItems.add(wantedAuthor);
	}

	// Not guarded by wanted.wants(IGKey.FILE_ENCODING): the charset is
	// needed for the consistency check of every document.
	// find charset from '<meta content="text/html;charset=iso-8859-1" />'
	// ... and from '<meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" />'
	// ... or from '<?xml version="1.0" charset="iso-8859-1" ?>'
	// (local is named charset so it does not shadow the encoding field)
	String charset = "charset";
	String charsetContext = "(?:" + charset +"|encoding) ?= ?(?:\"|\')?([^\"]*)(?:\"|\')";
	wantedEncoding = new WantedItem( charset, IGKey.FILE_ENCODING, 
					 "HTML_FIND_CHARSET", charsetContext);
	wantedEncoding.inHead(true);
	wantedItems.add(wantedEncoding);

	if (wanted.wants(IGKey.DESCRIPTION)) {
	    // find description from '<meta name="description" content="Nonsense">'
	    // ... and from '<meta http-equiv="description" content="Nonsense">'
	    String description = "description";
	    wantedDescription = new WantedItem( description, IGKey.DESCRIPTION, 
						"FP_FOUND_DESC", 
						metaContentContext(description));
	    wantedDescription.inHead(true);
	    wantedItems.add(wantedDescription);

	    // second chance find the content of a html/body/p element
	    String para = "p";
	    String paraContext =  "< ?" + para + " ?(?:[^>]*)>(.*?(?!/ ?" + para + "))</? ?(?:" + para + "|h)";
	    // TODO the WantedItem should perform the matching, possibly
	    // multi-level, e.g. discarding nested tags from the capture.
	    wantedPara = new WantedItem( para, IGKey.DESCRIPTION, 
					    "FP_FOUND_DESC", paraContext);
	    wantedPara.inBody(true);
	    wantedItems.add(wantedPara);
	}

	if (wanted.wants(IGKey.KEYWORDS)) {
	    // find keywords from '<meta name="keywords" content="A, B CDE">'
	    // ... and from '<meta http-equiv="keywords" content="A, B CDE">'
	    String keywords = "keywords";
	    wantedKeywords = new WantedItem( keywords, IGKey.KEYWORDS, 
					     "FP_FOUND_KEYWORDS", 
					     metaContentContext(keywords));
	    wantedKeywords.inHead(true);
	    wantedItems.add(wantedKeywords);
	}

	if (wanted.wants(IGKey.TITLE)) {
	    // find the content of a html/head/title element
	    String title = "title";
	    String titleContext = "< ?" + title + " ?>([^<]*)</ ?" + title + " ?>";
	    wantedTitle = new WantedItem( title, IGKey.TITLE, 
					  "FP_FOUND_TITLE", titleContext);
	    wantedTitle.inHead(true);
	    wantedItems.add(wantedTitle);

	    // second chance find the content of a html/body/h[1-6] element
	    String heading = "h";
	    String headingContext = "< ?" + heading + "(?:[1-6]) ?(?:[^>]*)>(.*?(?!/ ?" 
		+ heading + "))</ ?" + heading + "[1-6] ?>";
	    // TODO the WantedItem should perform the matching, possibly
	    // multi-level, e.g. discarding nested tags from the capture.

	    wantedHeading = new WantedItem( heading, IGKey.TITLE, 
					    "FP_FOUND_TITLE", headingContext);
	    wantedHeading.inBody(true);
	    wantedItems.add(wantedHeading);
	}
	
	return;
    }

    /**
     * Builds the regular expression capturing the content attribute of
     * a meta element whose name or http-equiv attribute equals the
     * given name, e.g. '<meta name="author" content="A. Nonymous">'.
     * Single blanks are tolerated because the input is whitespace
     * normalized before matching.
     *
     * @param metaName value of the name/http-equiv attribute to look for
     * @return the pattern source; group 1 is the content value
     */
    private static String metaContentContext(String metaName) {
	return "< ?meta ?(?:name|http-equiv) ?= ?(?:\"|') ?"
	    + "(?:" + metaName +") ?"
	    + "(?:\"|') ? content ?= ?(?:\"|') ?([^\"]*) ?(?:\"|') ?.?>";
    }

    /**
     * Installs the logger this parser reports to.
     * @param logObj The object to use for logging data
     */
    public void setLog(IGLog logObj) {
	this.log = logObj;
    }

    /**
     * Utility routine appending one chunk of characters from the reader
     * to a StringBuffer. After countChars characters the routine keeps
     * reading up to and including the next whitespace character, so the
     * chunk never ends in the middle of a token.
     *
     * @param stringBuffer StringBuffer to read into
     * @param countChars to read
     * @param reader BufferedReader to read from
     * @return number of characters appended; less than countChars only
     *         at end of input
     * @throws IOException if reading fails; the failure is logged first
     */
    private int readChunk( StringBuffer stringBuffer, 
			   int countChars, BufferedReader reader )
	throws IOException {
	int ch = 0;
	int readChars = 0;
	try {
	    while (readChars < countChars && (ch = reader.read()) != -1) {
		readChars++;
		stringBuffer.append((char) ch);
	    }
	    /** 
	     * optionally read some more, until whitespace read, 
	     * otherwise normalisation may cause ugly side effects
	     * since after stripping ws tokens, one loses information 
	     * at the end of the normalized slice.
	     * Any whitespace ends the chunk (the normalisation splits on
	     * all of it), so input without blanks is not read in one go.
	     */
	    while ( ch != -1 && !Character.isWhitespace((char) ch)
		    && (ch = reader.read()) != -1) {
		readChars++;
		stringBuffer.append((char) ch);
	    }
	}
	catch (IOException e) {
	    if (log != null) {
		log.add(IGLog.FILE, "Error reading from resource: " +
			e.getMessage());
		if (LOGLEVEL >= IGLog.FILE)
		    log.add(IGLog.FILE, "Giving up.");
	    }
	    // Give up on this document only; never terminate the JVM
	    // from inside a parser.
	    throw e;
	}

	return readChars;
    }

    /**
     * Perform parsing on an open stream. The stream is decoded with the
     * charset stored in the file structure (or the default charset). If
     * that parse reports a different charset and the file has a
     * location, the location is reopened and parsed once more with the
     * charset then stored in the file structure. The given stream is
     * closed by this method.
     *
     * @param file The IGFile to fill in attributes for
     * @param stream The data source for parsing
     * @throws IOException if an error occurs while opening or closing
     *         the data source
     * @throws StreamResetException if reading fails while parsing, or if
     *         a reparse is needed but the file has no location
     */
    public void parse(IGFile file, InputStream stream)
	throws IOException, StreamResetException {

	// must come first: everything below logs unconditionally
	if (log == null)
	    // FIXME
	    return;

	if (LOGLEVEL >= IGLog.PROCEDURE)
	    log.add(IGLog.PROCEDURE, "HTMLParser_RE.parse(IGFile, InputStream)");

        fileName = file.getLocation();
	// FIXME again constructor like stuff slipping into parse
	encoding = file.getString(IGKey.FILE_ENCODING);
	if (encoding == null) 
	    encoding = defaultEncoding.toLowerCase();
	if (LOGLEVEL >= IGLog.FILE)
	    log.add(IGLog.FILE, 
		    "Processing stream. Expected charset encoding: '" 
		    + encoding + "'");

	boolean charsetConsistent = parseDecoded(file, stream, encoding);

	if (!charsetConsistent) {
	    if (fileName == null)
		throw new StreamResetException();

	    stream = new FileInputStream(fileName);
	    encoding = file.getString(IGKey.FILE_ENCODING);

	    if (LOGLEVEL >= IGLog.FILE)
		log.add(IGLog.FILE, 
			"Re-Processing stream. Expected charset encoding: '" 
			+ (encoding == null ? defaultEncoding : encoding) + "'");

	    charsetConsistent = parseDecoded(file, stream, 
					     (encoding == null 
					      ? defaultEncoding : encoding));
	}

	if (LOGLEVEL >= IGLog.FILE)
	    log.add(IGLog.FILE, 
		    "All processing of stream. Done charsetConsistent now:: '" 
		    + charsetConsistent + "'");

	return;
    }

    /**
     * Runs one parse over the stream decoded with the given charset and
     * closes the stream afterwards, whether or not parsing succeeded.
     *
     * @param file The IGFile to fill in attributes for
     * @param stream The data source, closed on return
     * @param charset name of the charset to decode the stream with
     * @return the result of doParse
     * @throws IOException if the charset is unsupported or closing fails
     * @throws StreamResetException if reading fails while parsing
     */
    private boolean parseDecoded(IGFile file, InputStream stream, 
				 String charset)
	throws IOException, StreamResetException {

	BufferedReader reader;
	try {
	    reader = new BufferedReader(new InputStreamReader(stream, charset));
	}
	catch (IOException unsupported) {
	    // no reader owns the stream yet, so release it here
	    try {
		stream.close();
	    }
	    catch (IOException ignored) {
		// the charset problem is the one worth reporting
	    }
	    throw unsupported;
	}

	boolean consistent = false;
	boolean finished = false;
	try {
	    consistent = doParse(file, reader);
	    finished = true;
	}  
	catch (IOException ioe) {
	    throw new StreamResetException();
	}
	finally {
	    if (!finished) {
		try {
		    reader.close();
		}
		catch (IOException ignored) {
		    // do not mask the failure that brought us here
		}
	    }
	}

	reader.close();
	return consistent;
    }

    /**
     * Perform parsing on the given source: opens the file's location,
     * parses it and closes it again. Does nothing if no logger is set.
     *
     * @param file The structure to have attributes extracted from and put into
     * @throws FileNotFoundException if the file's location cannot be opened
     * @throws IOException if an error occurs while opening or closing
     *         the data source
     */
    public void parse(IGFile file) throws IOException, FileNotFoundException {
	if (log == null)
	    // FIXME
	    return;
	
	if (LOGLEVEL >= IGLog.PROCEDURE)
	    log.add(IGLog.PROCEDURE, "HTMLParser_RE.parse(IGFile)");
	
        fileName = file.getLocation();

	if (LOGLEVEL >= IGLog.FILE)
	    log.add(IGLog.FILE, "Processing stream from file: " + fileName);

	InputStream stream = new FileInputStream(fileName);
	try {
	    parse(file, stream);
	}
	catch (StreamResetException sre) {
	    // todo: proper error message
	    log.add(1, "Oops, got StreamResetException: " + sre.getMessage());
	}
	finally {
	    // Release the file handle on every path; closing a stream
	    // that was already closed further down is harmless.
	    try {
		stream.close();
	    }
	    catch (IOException ignored) {
		// nothing useful can be done about a failing close
	    }
	}

	return;
    }

    /**
     * Perform real work on the given source: reads the document in
     * chunks, normalizes whitespace, checks the charset, then searches
     * html/head for the wanted items and falls back to html/body
     * (heading for title, paragraph for description) where needed.
     * Found values are put into the file structure.
     *
     * @param file The structure found values are put into
     * @param reader The source to have attributes extracted from
     * @return false if a different charset than the one in use was
     *         detected (UTF-16 byte order mark, or a declared charset
     *         differing from the current encoding) and parsing was
     *         abandoned; true otherwise, including the cases where the
     *         search was given up
     * @throws IOException if closing the reader fails
     */
    private boolean doParse(IGFile file, BufferedReader reader) 
	    throws IOException {

	if (LOGLEVEL >= IGLog.PROCEDURE)
	    log.add(IGLog.PROCEDURE, "HTMLParser_RE.doParse(IGFile,BufferedReader)");
	
	if (LOGLEVEL >= IGLog.FILE)
	    log.add(IGLog.FILE, "Reading IGFile, chunksize: " + firstChunkSize);

	if (wantParser) {
	    file.put(IGKey.PARSER, PARSER);	    
	    if (LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "Setting IGKey.PARSER: " + PARSER);
	}

	if (wantFileType) {
	    file.put(IGKey.FILE_TYPE, new String("HTML"));
	    // FIXME log skew ahead :)
	    if (LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "Setting IGKey.FILE_TYPE: " + "HTML");
	}

	// Forget what was found in a previously parsed document.
	Iterator iterator = wantedItems.iterator();
	while (iterator.hasNext()) {
	    WantedItem wantedItem = (WantedItem)iterator.next();
	    // reset of internal state, since fileparsers may be reused 
	    wantedItem.isFound(false);
	    if (LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "REGEX[" + wantedItem.name() + "]: '" 
			+ wantedItem.getContext() + "'");

	}

	// sb collects the whitespace-normalized text read so far: the
	// tokens of each chunk, each followed by a single blank.
	String repl = " ";
        StringBuffer sb = new StringBuffer();

        int ch;
	int readChars = 0;
	int prevReadChars = 0;
	int nth = 1;
	StringBuffer text = new StringBuffer();
	readChars += readChunk(text, firstChunkSize, reader);
	if (LOGLEVEL >= IGLog.FILE)
	    log.add(IGLog.FILE, "Read chunk no. " + nth 
		    + ", totals: " + readChars + " chars)");
	nth++;
	// Documents shorter than 80 characters are not searched at all.
	if (text.length()<80) {
	    if (LOGLEVEL >= IGLog.PROGRESS)
		log.add(IGLog.PROGRESS, "chunk no. " + nth 
		    + ", totals: " + readChars + " chars, so giving up.");
	    return true;
	}
	/** All input readChars now in text */
	// A byte order mark means the document has to be re-read as
	// utf-16: record that charset and report the inconsistency.
	if (checkUTF16(text.substring(0,2), encoding)) {
	    encoding = "utf-16";
	    file.put(IGKey.FILE_ENCODING, encoding);
	    if (LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "Checked(NOK) encoding / charset: '" 
			+ encoding + "' / '" + wantedEncoding.value()+ "'");
	    return false;
	}
	if (LOGLEVEL >= IGLog.DEBUG)
	    log.add(IGLog.DEBUG, "Checked(OK) encoding / charset: '" 
		    + encoding + "' / '" + wantedEncoding.value()+ "'");

	StringTokenizer tokens = new StringTokenizer(text.toString());
	while (tokens.hasMoreTokens())
	    sb.append(tokens.nextToken()).append(repl);
	/** All input now normalized in sb */

	if (LOGLEVEL >= IGLog.FILE)
	    log.add(IGLog.FILE, "Extracting file's header data.");

	boolean parseHead = false;
	boolean encodingFound = false;
	boolean parseBody = false;
	// FIXME billy danger says, wantedHead, -Body and -URLs are **not** in wantedItems!
	wantedHead.isFound(false);
	wantedBody.isFound(false);
	// Case 1: the closing head tag is already within the first chunk.
	if (wantedHead.match(sb.toString())) {
	    parseHead = true;
	    if (LOGLEVEL >= IGLog.FILE)
		log.add(IGLog.FILE, "Checking charset encoding consistence.");
	    if (wantedEncoding.match(sb.toString())) {
		encodingFound = true;
		wantedEncoding.getMatch();
		file.put(wantedEncoding.id(), wantedEncoding.value());
		if (LOGLEVEL >= IGLog.PROGRESS)
		    log.addResource(IGLog.PROGRESS, wantedEncoding.success(),
				    new String[]{wantedEncoding.value()});
		if (LOGLEVEL >= IGLog.DEBUG)
		    log.add(IGLog.DEBUG, "encoding / charset: '" 
			    + encoding + "' / '" + wantedEncoding.value()+ "'");
		// The declared charset (already stored in file above)
		// differs from the one in use: report the inconsistency.
		if (encoding != null &&
		    ! encoding.equalsIgnoreCase(wantedEncoding.value())) {
		    if (LOGLEVEL >= IGLog.INFO)
			log.addResource(IGLog.INFO, "HTML_WRONG_CHARSET",
					null);
		    return false;
		}
	    }
	}
	// Case 2: no closing head tag yet; read one more chunk and retry.
	else {
	    prevReadChars = readChars;
	    StringBuffer text2 = new StringBuffer();
	    readChars += readChunk(text2, firstChunkSize, reader);
	    if (LOGLEVEL >= IGLog.FILE)
		log.add(IGLog.FILE, "Read additional chunk no. " + nth 
			+ ", totals: " + readChars + " chars)");
	    nth++;
	    /** All input readChars now in text2 */
	    
	    StringTokenizer tokens2 = new StringTokenizer(text2.toString());
	    while (tokens2.hasMoreTokens())
		sb.append(tokens2.nextToken()).append(repl);
	    /** All input still normalized in sb */
	    
	    if (!wantedHead.match(sb.toString())) {
		/** FIXME avoid endless loops due to <head>-less html */
		// Nothing more could be read, so there is no head at all.
		if ( readChars == prevReadChars ) {
		    if (LOGLEVEL >= IGLog.PROGRESS)
			log.addWarning(80, "HTML_NO_HEAD", new String[]{fileName});
		    if (LOGLEVEL >= IGLog.FILE)
			log.add(IGLog.FILE, "Continuing with html/body of '"
					+ fileName +"'");
		    // FIXME ... with body, because boolean parseHead set to false
		}
	    }
	    else 
		parseHead = true;

	    // encodingFound is always false in this branch; the charset
	    // check mirrors the one of case 1.
	    if (!encodingFound) {
		if (wantedEncoding.match(sb.toString())) {
		    wantedEncoding.getMatch();
		    file.put(wantedEncoding.id(), wantedEncoding.value());
		    if (LOGLEVEL >= IGLog.PROGRESS)
			log.addResource(IGLog.PROGRESS, wantedEncoding.success(),
					new String[]{wantedEncoding.value()});
		    if (LOGLEVEL >= IGLog.DEBUG)
			log.add(IGLog.DEBUG, "encoding / charset: '" 
				+ encoding + "' / '" + wantedEncoding.value()+ "'");
		    if (encoding != null &&
			! encoding.equalsIgnoreCase(wantedEncoding.value())) {
			if (LOGLEVEL >= IGLog.INFO)
			    log.addResource(IGLog.INFO, "HTML_WRONG_CHARSET",
					    null);
			return false;
		    }
		}
	    }
	}
	
	// NOTE(review): this match replaces wantedBody's last match only;
	// wantedHead keeps its own matcher, so getEndIndex() below still
	// refers to the head match.
	if (wantedBody.match(sb.toString())) 
	    parseBody = true;
	
	StringBuffer onlyHead;
	StringBuffer noHead;
	boolean someMissing = false;
	if (parseHead) {
	    if (LOGLEVEL >= IGLog.FILE)
		log.add(IGLog.FILE, "Searching for targets in html/head.");
	    
	    // Split the normalized text behind the closing head tag.
	    onlyHead = new StringBuffer(sb.substring(0, wantedHead.getEndIndex()));
	    if (LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "Found EndIndex of MatchHead to: " 
			+ wantedHead.getEndIndex());
	    noHead = new StringBuffer(sb.substring(wantedHead.getEndIndex(),sb.length()));
	    
	    // Search the head part for every not yet found head target.
	    iterator = wantedItems.iterator();
	    while (iterator.hasNext()) {
		WantedItem wantedItem = (WantedItem)iterator.next();
		// reset of internal state already done above (instance reuse!)
		if (wantedItem.isFound())
		    continue;
		if (!wantedItem.inHead()) 
		    continue;
		
		if (wantedItem.match(onlyHead.toString())) {
		    wantedItem.getMatch();
		    file.put(wantedItem.id(), wantedItem.value());
		    if (LOGLEVEL >= IGLog.PROGRESS)
			log.addResource(IGLog.PROGRESS, wantedItem.success(),
					new String[]{wantedItem.value()});
		}
		else {
		    if (LOGLEVEL >= IGLog.DEBUG)
			log.add(IGLog.DEBUG, wantedItem.success() + "[" 
				+ wantedItem.name() + "] not found in html/head." );
		    someMissing = true;
		}
		
	    }
	    if (someMissing) {
		if (LOGLEVEL >= IGLog.DEBUG)
		    log.add(IGLog.DEBUG, "html/head: '" + onlyHead + "'" );
	    }
	}

	// Construct RE for matching target of a-elements (href-attrib
	String a = "a";
	//String isolateA = "< ?" + a + " ?href ?= ?(?:\"|') ?([^\"]*) ?(?:\"|') ?.?>";
	String isolateA = "< ?" + a + ".*?href ?= ?(?:\"|') ?([^\"]*) ?(?:\"|').*?>";
	wantedURLs = new WantedItem( a, new IGKey(0), 
				     "HTML_FOUND_URL", isolateA);
	wantedURLs.isFound(false);
	boolean hyperlinksFound = false;
	// get some URLs from sb, a first hack, sorry for that
	if (wantURLs && wantedURLs.matchAll(sb.toString())) {
	    if (LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "first chunk's hyperlinks extracted." );
	    hyperlinksFound = true;	
	}
	
	// Fallback: without a head, or with head targets missing, search
	// the text for a heading (as title) and a paragraph (as
	// description).
	if (!parseHead || someMissing) {
	    // From here on noHead and sb are the same buffer, i.e. the
	    // whole normalized text including the head part.
	    noHead = sb;
	    if (LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "html/body: '" + noHead + "'" );
	    
	    if (!parseBody && wantedBody.match(noHead.toString())) 
		parseBody = true;

	    boolean titleNotFound = (wantTitle ? !wantedTitle.isFound() : false);
	    boolean descriptionNotFound = (wantDescription ? !wantedDescription.isFound() : false);
	    
	    if (titleNotFound || descriptionNotFound) {
		boolean headingNotFound = false; 
		if (titleNotFound ) 
		    headingNotFound = !wantedHeading.match(noHead.toString()); 
		
		boolean paraNotFound = false; 
		if (descriptionNotFound )
		    paraNotFound = !wantedPara.match(noHead.toString());
		
		// Still something missing: read one more chunk and retry.
		if ( (titleNotFound && headingNotFound) 
		     || (descriptionNotFound && paraNotFound) ) {
		    
		    prevReadChars = readChars;
		    StringBuffer text3 = new StringBuffer();
		    readChars += readChunk(text3, firstChunkSize, reader);
		    if (LOGLEVEL >= IGLog.FILE)
			log.add(IGLog.FILE, "Read additional chunk (now html/body) no. " 
				+ nth + ", totals: " + readChars + " chars)");
		    nth++;
		    /** All input readChars now in text3 */
		    
		    StringTokenizer tokens3 = new StringTokenizer(text3.toString());
		    while (tokens3.hasMoreTokens())
			noHead.append(tokens3.nextToken()).append(repl);
		    /** All input still normalized in noHead */
		    if (LOGLEVEL >= IGLog.DEBUG)
			log.add(IGLog.DEBUG, "html/whatever: '" + noHead + "'" );
		    
		    if (!parseBody && wantedBody.match(noHead.toString())) 
			parseBody = true;

		    if (titleNotFound ) 
			headingNotFound = !wantedHeading.match(noHead.toString()); 
		    
		    if (descriptionNotFound )
			paraNotFound = !wantedPara.match(noHead.toString());
		    
		    if ( (titleNotFound && headingNotFound) 
			 || (descriptionNotFound && paraNotFound) ) {
			
			/** 
			 * FIXME avoid endless loops due 
			 * to <body/p+body/h[1-6]>-less html 
			 */
			if ( readChars == prevReadChars ) {
			    // NOTE(review): only the addWarning call is
			    // governed by the two ifs below; addError runs
			    // whenever this branch is reached, although
			    // the indentation suggests otherwise. Confirm
			    // which was intended.
			    if (!parseBody)
				if (LOGLEVEL >= IGLog.PROGRESS)
				    log.addWarning(81, "HTML_NO_BODY", new String[]{fileName}); 
				log.addError(82, "HTML_BODY_NO_HXP", new String[]{fileName}); 
			    if (LOGLEVEL >= IGLog.FILE)
				log.add(IGLog.FILE, "Giving up.");
			    // NOTE(review): returns before body values and
			    // hyperlinks collected so far are put into file.
			    return true;
			}
		    }
		    
		}
		if (!parseBody)
		    if (LOGLEVEL >= IGLog.PROGRESS)
			log.addWarning(81, "HTML_NO_BODY", new String[]{fileName}); 
		
		// get rest of URLs from noHead, again a first hack, sorry for that
		if (wantURLs && wantedURLs.matchAll(noHead.toString())) {
		    if (LOGLEVEL >= IGLog.DEBUG)
			log.add(IGLog.DEBUG, "last chunk's hyperlinks extracted." );
		    hyperlinksFound = true;
		}

		// FIXME (now we got all hard coded buckets filled)
		// Store what the body targets captured in their last match.
		Iterator iteratorBody = wantedItems.iterator();
		while (iteratorBody.hasNext()) {
		    WantedItem wantedItem = (WantedItem)iteratorBody.next();

		    if (!wantedItem.inBody()) 
			continue;
		    
		    if (wantedItem.isFound()) {
			wantedItem.getMatch();
			file.put(wantedItem.id(), wantedItem.value());
			if (LOGLEVEL >= IGLog.PROGRESS)
			    log.addResource(IGLog.PROGRESS, wantedItem.success(),
					    new String[]{wantedItem.value()});
		    }
		    
		}
	    }
	    else {
		// Reached when title and description need no fallback,
		// e.g. only another head target was missing.
		if (LOGLEVEL >= IGLog.PROGRESS)
		    log.add(IGLog.PROGRESS, 
			    "Error, body was scanned, but title and desc from html/head ! ");
	    }
	    
	}

	reader.close();        

	// Hand the collected link targets over to the file structure.
	if (wantURLs && hyperlinksFound) {
	    hyperlinks = wantedURLs.getValues();
	    file.put(IGKey.URLS, hyperlinks);
	    hyperlinks = null;
	    if (LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "Setting IGKey.URLS to HashSet");
	}

	return true;
    }

    /**
     * Hands out the URL's collected for an HTML document and removes
     * them from the file structure.
     * @param file The file to get URL's from
     * @return An array of URL's as String's
     */
    public String[] getLinks(IGFile file) {
	if (LOGLEVEL >= IGLog.PROCEDURE) {
	    String location = file.getLocation();
	    log.add(IGLog.PROCEDURE, 
		    "HTMLParser_RE.getLinks(IGFile[" + location + "])");
	}

	// fetch first, then drop the entry from the file structure
	HashSet collected = (HashSet) file.get(IGKey.URLS);
	file.remove(IGKey.URLS);

	String[] links = IGMisc.hashSetToStringArray(collected);
	return links;
    }

    /**
     * Switches the collection of links on or off.
     * @param pref <tt>true</tt> to have links collected, 
     * 	<tt>false</tt> to have no links collected.
     */
    public void wantURLs(boolean pref) {
	this.wantURLs = pref;
    }


    /**
     * Guess charset: checks whether a document starts with a UTF-16 byte
     * order mark (bytes FE FF for big endian, FF FE for little endian).
     *
     * The characters are mapped back to bytes via ISO-8859-1, which
     * round-trips every byte of a document that was decoded as
     * ISO-8859-1; the platform default charset would turn the mark into
     * other bytes on e.g. UTF-8 platforms.
     *
     * @param bom the first two characters of the decoded document
     * @param encoding the charset the document was decoded with; not
     *        evaluated for the check
     * @return true if a UTF-16 byte order mark was found; false
     *         otherwise, also for null or fewer than two characters
     */
    public boolean checkUTF16(String bom, String encoding) {
	if (bom == null || bom.length() < 2)
	    return false;

	byte[] boms;
	try {
	    boms = bom.getBytes("ISO-8859-1");
	}
	catch (UnsupportedEncodingException e) {
	    // ISO-8859-1 is a charset every Java platform has to
	    // support; fall back to the former behaviour nevertheless
	    boms = bom.getBytes();
	}
	if (boms.length < 2)
	    return false;

	boolean bigEndian = (boms[0] == -2 && boms[1] == -1);    // FE FF
	boolean littleEndian = (boms[0] == -1 && boms[1] == -2); // FF FE

	if (bigEndian || littleEndian) {
	    String detected = "utf-16";
	    String orderDetected = (littleEndian ? "LE" : "BE");
	    
	    if (log != null && LOGLEVEL >= IGLog.DEBUG)
		log.add(IGLog.DEBUG, "Detected BOM[" + boms[0] + "," 
			+ boms[1] + "]("+ orderDetected +") should reset resource and reread as '" + detected + "'");
	    return true;
	}
	
	if (log != null && LOGLEVEL >= IGLog.DEBUG)
	    log.add(IGLog.DEBUG, "Detected **no** BOM[" + boms[0] + "," 
		    + boms[1] + "]");
	
	return false;
    }

    /**
     * Get the file extensions this parser can handle
     * @return String array of file extensions; a copy, so callers
     *         cannot alter the parser's own list
     */
    public String[] getExtensions() {
	return (String[]) extensions.clone();
    }

    /**
     * Get the mime types this parser can handle
     * @return String array of mime types; a copy, so callers cannot
     *         alter the parser's own list
     */
    public String[] getMimeTypes() {
	return (String[]) mimeTypes.clone();
    }

    /**
     * Supply file magic for files this parser can handle
     * @return the FileMagic structure built from this parser's magic
     *         signatures
     */
    public FileMagic getMagic() {
	final FileMagic signature = htmlMagic;
	return signature;
    }

}

/*
 * $Log: HTMLParser_RE.java,v $
 * Revision 1.19  2002/09/17 08:40:34  sdrees
 * fixing server methods, stripped unused vars
 *
 * Revision 1.18  2002/09/16 03:51:15  sdrees
 * enlarged chunk size
 *
 * Revision 1.17  2002/09/11 13:22:25  howama
 * bug fixes
 *
 * Revision 1.16  2002/09/06 19:27:34  sdrees
 * collects urls from href-attrib, resets internal class correctly, no frame src-attrib yet
 *
 * Revision 1.15  2002/09/06 14:37:56  sdrees
 * fixed body-elements having attribs
 *
 * Revision 1.14  2002/09/06 09:58:47  sdrees
 * bugfixes, IGLog.addWarning et al., merge of howama ui changes
 *
 * Revision 1.13  2002/09/04 20:08:48  howama
 * user interface work
 *
 * Revision 1.12  2002/08/30 06:17:22  sdrees
 * first encoding guessing, via BOM for utf-16 (le) introduced
 *
 * Revision 1.11  2002/08/29 20:34:01  sdrees
 * minor bugfixes, more robust pattern for <p>-content, if description not found in header
 *
 * Revision 1.10  2002/08/29 14:14:30  sdrees
 * mainly bugfixes, eg no second try for <p>-content, if description not wanted in the first place
 *
 * Revision 1.9  2002/08/28 21:55:12  sdrees
 * added encoding detection via <?xml encoding=charset ?>
 *
 * Revision 1.8  2002/08/28 16:57:12  sdrees
 * encoding tricks added to HTMLParser_RE, but only HTMLParser_RE.parse(IGFile) resetable for now
 *
 * Revision 1.7  2002/08/27 12:10:22  sdrees
 * parses now even headless text with <h[1-6]> or <p> included, bugfixes
 *
 * Revision 1.6  2002/08/26 17:10:27  sdrees
 * trying to follow the interface changes, HTMLParser_RE compiles again...;(
 *
 * Revision 1.5  2002/08/26 16:37:19  sdrees
 * inserted fallback logic again (title or h, description or p), migrated to org.apache.oro.text.regex, bugfixes, now finally faster approx 10 % than FSM based html parser
 *
 * Revision 1.4  2002/08/25 19:13:52  sdrees
 * parser now 3 times faster and much cleaner, extensible interface comes closer.
 *
 * Revision 1.3  2002/08/22 21:16:13  sdrees
 * Adapted HTMLParser_RE to split of IGKeySet in IGKey and IGKeySet, compiles again:grin)
 *
 * Revision 1.2  2002/08/22 06:07:40  sdrees
 * Adapted IGKeySet changes from mark.
 *
 * Revision 1.1  2002/08/20 07:36:07  sdrees
 * ready for cvs, implement new FileParser-API (interface), move into cvs as version 1.0, test driver now below tests.
 *
 * Revision 0.9  2002/08/18 18:00:00  sdrees
 * ready for cvs, implement new FileParser-API (interface), move into cvs as version 1.0, test driver now below tests.
 *
 * Revision 0.8  2002/08/09 19:00:00  sdrees
 * changed test (main) method to more typical usage pattern, adapted Ben's naming scheme, reverted to minmal implementation to enhance comparability of performance, lots of laziness and defensivness temporarily gone
 *
 * Revision 0.7  2002/08/01 17:03:00  sdrees
 * further preps for interface implementation (API), lazy parsing, a few bug fixes, accept meta http-equiv
 *
 * Revision 0.6  2002/08/01 10:49:44  sdrees
 * started API for ease of integration in indexgen project, store found entities in HashMap
 *
 * Revision 0.5  2002/07/31 18:07:31  sdrees
 * introduce reluctant chunky loading/parsing: phasing (guessedHeaderSize), adjusted regexes: restrict "\s*" to "\s?" due to input filtering, correct test for matches, comments adapted
 *
 * Revision 0.4  2002/07/30 17:30:19  sdrees
 * bugfixes and clarifications
 *
 * Revision 0.3  2002/07/30 10:27:22  sdrees
 * Stripping unrelated stuff, updating javadoc strings
 *
 * Revision 0.2  2002/07/28 12:12:14  sdrees
 * Testing the regexes in the wild
 *
 * Revision 0.1  2002/07/22 01:44:44  sdrees
 * Initial thoughts.
 *
 */
