Guido Krüger's Web Service

SGML/HTML Scanner


Introduction

It's often necessary to perform simple processing on SGML and/or HTML files. Since text data and markup are freely mixed, conventional tools like awk, sed or grep are not well suited for this purpose. I've written a fairly simple class, SGMLScanner, which can be used for the lexical analysis of SGML/HTML files. To perform your processing, simply derive a new class from SGMLScanner and override the appropriate callbacks. The SGML scanner is in the gk.util.sgml package.

Example

I've written a simple example application which changes the appearance of program listings in a file. Everything that is formatted with the <pre> tag in the original file looks like a nicely formatted paper listing, with line numbers and a colored background, in the resulting file.

This is what an original listing looks like:

public class Hello
{
  public static void main(String args[])
  {
    System.out.println("Hello, world");
  }
}

And here's the result after the conversion:

001 public class Hello
002 {
003   public static void main(String args[])
004   {
005     System.out.println("Hello, world");
006   }
007 }

The source code is:

package gk.util.sgml;

import java.io.*;
import gk.util.*;

/**
 * Example filter built on SGMLScanner. It copies the scanned document to a
 * Writer and replaces every &lt;pre&gt; section by a table that has one row
 * per line, a three-digit line number in front of each line and an
 * alternating row background color.
 *
 * NOTE(review): getBuffer() is inherited from SGMLScanner; it is presumably
 * the raw text of the token that triggered the current callback - confirm
 * against the base class.
 */
public class Example1
extends SGMLScanner
{
  //Pseudo constants
  static final String NL = System.getProperty("line.separator");

  //Instance variables
  protected Writer    out;       //destination of the converted document
  protected boolean   insidepre; //true between <pre> and </pre>

  /**
   * Creates a scanner that writes the converted document to out.
   * The Writer is not closed by this class; that is up to the caller.
   */
  public Example1(Writer out)
  {
    this.out = out;
    this.insidepre = false;
  }

  //--- Callbacks from base class -------------------------------
  /**
   * Called for an element (tag). The opening "html" tag is copied and
   * followed by a marker comment; "pre" start and end tags are replaced by
   * the table start and end tags and toggle insidepre; every other tag is
   * copied via getBuffer().
   */
  protected void element(SGMLElement element)
  {
    String name = element.getName();
    if (name.equals("html") && element.isStartTag()) {
      write(getBuffer());
      writeln("<!--<pre> tags changed by gk.util.sgml.Example1-->");
    } else if (name.equals("pre")) {
      if (element.isStartTag()) {
        writeln("<table border cellspacing=0 cellpadding=0>");
        insidepre = true;
      } else {
        writeln("</table>");
        insidepre = false;
      }
    } else {
      write(getBuffer());
    }
  }

  /**
   * Called for special content; it is copied via getBuffer().
   */
  protected void special(String content)
  {
    write(getBuffer());
  }

  /**
   * Called for character data. Outside of a &lt;pre&gt; section the data is
   * copied unchanged. Inside, the data is split at the platform line
   * separator and every line becomes one table row: line number first, then
   * the leading spaces as non-breaking spaces, then the rest of the line.
   *
   * Line numbering and row coloring start over on every call.
   * NOTE(review): if the scanner delivers one &lt;pre&gt; section in several
   * pcData calls, numbering restarts in the middle of the listing - verify
   * against SGMLScanner.
   */
  protected void pcData(String data)
  {
    if (!insidepre) {
      //not inside <pre>: output data unchanged
      write(data);
      return;
    }
    boolean green = false;
    int linenum = 1;
    if (data.startsWith(NL)) {
      //ignore first NL after <pre> tag
      data = data.substring(NL.length());
    }
    while (data.length() > 0) {
      //cut the next line off the front of data
      String line;
      int pos = data.indexOf(NL);
      if (pos == -1) { //last line
        line = data;
        data = "";
      } else {
        line = data.substring(0, pos);
        data = data.substring(pos + NL.length());
      }
      //create table cell
      write("<tr bgcolor=\"" + (green ? "#E0FFE0" : "#FFFFFF") + "\"><td><tt>");
      //write line number
      write(Str.getFormatted("%03d ", linenum++));
      //Output one non-break space per leading space. A plain space would be
      //collapsed by the browser inside a table cell and the indentation of
      //the listing would be lost.
      int indent = 0;
      while (indent < line.length() && line.charAt(indent) == ' ') {
        write("&nbsp;");
        indent++;
      }
      //output rest of line
      write(line.substring(indent));
      writeln("</tt>");
      green = !green;
    }
  }

  //--- private methods------------------------------
  /**
   * Writes s to the output file. An IOException is reported on System.err
   * and terminates the program with exit code 1, because the callbacks
   * above cannot pass the exception on to the caller.
   */
  private void write(String s)
  {
    try {
      out.write(s);
    } catch (IOException e) {
      System.err.println(e.toString());
      System.exit(1);
    }
  }

  /**
   * Writes s + NL to the output file.
   */
  private void writeln(String s)
  {
    write(s + NL);
  }

  //--- main ----------------------------------------
  /**
   * Main method. Expects the names of the source file and the destination
   * file, converts the source file and exits with code 1 if anything fails.
   */
  public static void main(String args[])
  {
    if (args.length != 2) {
      System.err.println("usage: java Example1 <sourcefile> <destfile>");
      System.exit(1);
    }
    boolean failed = false;
    Reader in  = null;
    Writer out = null;
    try {
      //Open the source first: if it cannot be read, the destination file
      //is neither created nor truncated.
      in  = new BufferedReader(
            new FileReader(args[0]));
      out = new BufferedWriter(
            new FileWriter(args[1]));
      Example1 scanner = new Example1(out);
      scanner.startScanner(in);
    } catch (IOException e) {
      System.err.println(e.toString());
      failed = true;
    } catch (SGMLScannerException e) {
      System.err.println(e.toString());
      failed = true;
    } finally {
      //Close both streams on every path. Closing out also flushes the
      //buffer, so a failure here means the destination file is incomplete.
      if (out != null) {
        try {
          out.close();
        } catch (IOException e) {
          System.err.println(e.toString());
          failed = true;
        }
      }
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          System.err.println(e.toString());
          failed = true;
        }
      }
    }
    //Exit only after the finally block: System.exit inside the catch
    //blocks would skip it and leave the streams unclosed.
    if (failed) {
      System.exit(1);
    }
  }
}

© 1995 - 2012 Guido Krüger - Back to top-level page