<copyright> CharEncoding class.
    Written by <a href="mailto:tiggr@ics.ele.tue.nl">Pieter J. Schoenmakers</a>

    Copyright &copy; 1996, 1997 Pieter J. Schoenmakers.

    This file is part of TOM.  TOM is distributed under the terms of the
    TOM License, a copy of which can be found in the TOM distribution; see
    the file LICENSE.

    <id>$Id: CharEncoding.t,v 1.13 1998/07/22 15:17:44 tiggr Exp $</id>
    </copyright>

<doc> The {CharacterEncoding} class defines the interface of byte
    encodings: conversions between bytes and characters, and character
    predicates and case conversions on bytes.  The class object itself
    declares no methods; the interface is in the instance part.  </doc>
implementation class
CharacterEncoding: instance (All)

end;

implementation instance
CharacterEncoding: instance (All)

<doc> Return the name of this encoding.  </doc>
deferred String
  name;

/******************** character conversions ********************/
<doc><h4>Character conversions</h4></doc>

<doc> Return the decoded byte {b}, i.e. the Unicode character
    corresponding to the byte {b} in the receiving encoding.  </doc>
deferred char
  decode byte b;

<doc> Return the byte encoding of the character {c}.  If the byte
    equivalent of the character {c} does not exist in the receiving
    encoding, an {encoding-condition} is signaled, and the byte encoded is
    the {byteValue} of the object returned, or 127 if {nil} is returned.
    </doc>
deferred byte
  encode char c;

/******************** character predicates ********************/
<doc> <h4>Character predicates</h4> </doc>

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a letter.  </doc>
deferred boolean
  isAlpha byte b;

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a digit.  </doc>
deferred boolean
  isDigit byte b;

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a lowercase letter.  </doc>
deferred boolean
  isLower byte b;

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a punctuation character.  </doc>
deferred boolean
  isPunct byte b;

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a space character.  </doc>
deferred boolean
  isSpace byte b;

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is an uppercase letter.  </doc>
deferred boolean
  isUpper byte b;

<doc> Return the lowercase version of the byte {b}, according to the
    receiving encoding.  If the character is not in uppercase, it is
    returned unharmed.  </doc>
deferred byte
  toLower byte b;

<doc> Return the uppercase version of the byte {b}, according to the
    receiving encoding.  If the character is not in lowercase, it is
    returned unharmed.  </doc>
deferred byte
  toUpper byte b;

<doc> Return the numeric value of the digit denoted by the byte {b} in the
    receiving encoding.  </doc>
deferred int
  digitValue byte b;

<doc> Return the index of the letter {b} relative to the start of its
    letter range.  Thus, 'a' returns 0, 'f' returns 5, etc.  </doc>
deferred int
  alphaValue byte b;

end;

<doc> An instance of the {CharEncoding} class maintains information on
    a particular mapping for encoding a subset of Unicode characters to
    8-bit bytes.  An example of such a mapping is <code>iso-8859-1</code>,
    the well-known Western European byte encoding, of which
    <code>USASCII</code> is a subset.  </doc>
implementation class
CharEncoding: State, Constants, Conditions, CharacterEncoding
{
  <doc> Currently known encodings, keyed by name.  The dictionary is
      created by the first invocation of {named}.  </doc>
  static MutableDictionary encodings;
}

<doc> Read {num} bytes, starting at offset 0, from the resource file with
    the {name} and the extension {ext} (sans dot), and return them as a
    frozen byte array.  The path of the file is obtained by asking
    {Bundle} to locate it.  If the file cannot be located, a message is
    printed on the error stream and no return value is assigned; the
    callers in this file test the result against {nil}.  </doc>
ByteArray
  loadBytes int num
       from String name
  extension String ext
{
  MutableByteArray bytes = [MutableByteArray withCapacity num];
  String path = [Bundle locate-file name extension ext];

  if (path != nil)
    {
      File file = [File open path input: YES flags: FILE_NOT_EXIST_RAISE];

      // Have the file closed when the read is unwound.
      unwind ({[file close];})
        [file readRange (0, num) into bytes];

      [bytes freeze];
      = bytes;
    }
  else
    [[[stdio err] print ("unable to locate resource `", ext,
                         "' of encoding `", name, "'")] nl];
}

<doc> Return the {CharEncoding} known as the {name}.  Encodings are kept
    in the {encodings} dictionary, so asking for the same {name} twice
    yields the same object.  An unknown {name} gets a new encoding
    object, initialized with just that {name}, which is registered
    before it is returned; nothing is read from disk here, as a
    {CharEncoding} reads the resources it needs on demand.  </doc>
instance (id)
  named String name
{
  // Make sure the registry exists before consulting it.
  if (encodings == nil)
    encodings = [MutableDictionary new];

  CharEncoding enc = encodings[name];

  if (enc == nil)
    {
      enc = [[self alloc] init name];
      encodings[name] = enc;
    }

  = enc;
}

end;

implementation instance
CharEncoding
{
  <doc> The name of this encoding.  It is also the base name of the
      resource files read by this encoding.  </doc>
  public String name;

  <doc> The decoding map, indexed by byte value.  Unset until the
      {decoding} method has read it.  </doc>
  CharArray decoding;

  <doc> The encoding map, from character to {ByteNumber}.  Unset until
      the {encoding} method has built it from the {decoding} map.  </doc>
  IntDictionary encoding;

  <doc> The byte map for conversion to lower case within the encoding.
      Read on first use by {toLower}.  </doc>
  ByteArray to_lower;

  <doc> The byte map for conversion to upper case within the encoding.
      Read on first use by {toUpper}.  </doc>
  ByteArray to_upper;

  <doc> The byte map for conversion to title case within the encoding.
      </doc>
  ByteArray to_title;

  <doc> The bitmap for testing whether a byte is a digit: 32 bytes, one
      bit for each byte value.  </doc>
  ByteArray is_digit;

  <doc> The bitmap for testing whether a byte is a letter.  </doc>
  ByteArray is_letter;

  <doc> The bitmap for testing whether a byte is lower case.  </doc>
  ByteArray is_lower;

  <doc> The bitmap for testing whether a byte is a punctuation character.
      </doc>
  ByteArray is_punct;

  <doc> The bitmap for testing whether a byte is a space character.
      </doc>
  ByteArray is_space;

  <doc> The bitmap for testing whether a byte is upper case.  </doc>
  ByteArray is_upper;
}

<doc> Designated initializer.  Remember {n} as the name of this encoding
    and return the receiver.  No resource is read here; the maps and
    bitmaps are loaded by the methods that first need them.  </doc>
id
  init String n
{
  name = n;

  = self;
}

<doc> Return the Unicode character which the byte {b} stands for in the
    receiving encoding.  The {decoding} map is read first if this has
    not happened yet.  </doc>
char
  decode byte b
{
  // The decoding method fills in the map as a side effect.
  if (decoding == nil)
    [self decoding];

  = decoding[b];
}

<doc> Return the {decoding} map, reading it first if that has not been
    done yet.  The map is built from the 512 bytes of the `map' resource
    of this encoding: entry {i} is 256 times byte {2 * i} of the
    resource plus byte {2 * i + 1}.  If the resource can not be loaded,
    {decoding} is left as it was.  </doc>
CharArray
  decoding
{
  if (!decoding)
    {
      ByteArray raw = [isa loadBytes 512 from name extension "map"];

      if (raw != nil)
        {
          MutableCharArray table = [MutableCharArray withCapacity 256];
          int code;

          for (code = 0; code < 256; code++)
            {
              // The most significant byte of each pair comes first.
              char high = raw[2 * code] * char (256);

              table[code] = raw[2 * code + 1] + high;
            }

          [table freeze];
          decoding = table;
        }
    }

  = decoding;
}

<doc> Return the byte which encodes the character {c} in the receiving
    encoding, building the {encoding} map first if needed.  If {c} has
    no entry in the map, an {encoding-condition} is signaled; the byte
    returned is then the {byteValue} of the object returned from the
    signal, or 127 if that object is {nil}.  </doc>
byte
  encode char c
{
  if (encoding == nil)
    [self encoding];

  ByteNumber code = encoding[c];

  // Not in the map: let a condition handler supply a replacement.
  if (code == nil)
    code = [[Condition for self class encoding-condition
                       message [[MutableByteString new]
                                 print ("no encoding for char ", int (c))]]
             signal];

  = code != nil ? [code byteValue] : byte (127);
}

<doc> Return the {encoding} map, creating it if it does not exist yet.
    The map is the reverse of the {decoding} map (which is read first if
    necessary): for each of the 256 byte values, in ascending order, the
    character it decodes to is entered as the key of a {ByteNumber}
    holding that byte value.  </doc>
IntDictionary
  encoding
{
  if (!encoding)
    {
      if (!decoding)
        [self decoding];

      MutableIntDictionary reverse = [MutableIntDictionary new];
      int code;

      for (code = 0; code < 256; code++)
        {
          char unicode = decoding[code];

          reverse[unicode] = [ByteNumber with code];
        }

      // Freeze can't work...
      // Tue Jun 24 23:28:25 1997, tiggr@tricky.es.ele.tue.nl
      // [reverse freeze];
      encoding = reverse;
    }

  = encoding;
}

<doc> Load and return the 256-byte conversion table for the {conversion}
    of the receiving encoding.  The table is read from the resource
    named after this encoding, with the extension `c.' followed by the
    {conversion}.  </doc>
protected ByteArray
  loadConversion String conversion
{
  = [isa loadBytes 256 from name
         extension [[MutableByteString new] print ("c.", conversion)]];
}

<doc> Load and return the 32-byte predicate set (one bit for each byte
    value) for the {predicate} of the receiving encoding.  The set is
    read from the resource named after this encoding, with the extension
    `p.' followed by the {predicate}.  </doc>
protected ByteArray
  loadPredicateSet String predicate
{
  = [isa loadBytes 32 from name
         extension [[MutableByteString new] print ("p.", predicate)]];
}

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a letter.  The `letter' predicate set is
    loaded on first use; the answer is bit {b % 8} of its byte {b / 8}.
    </doc>
boolean
  isAlpha byte b
{
  if (is_letter == nil)
    is_letter = [self loadPredicateSet "letter"];

  int mask = 1 << (b % 8);

  = (is_letter[b / 8] & mask) != 0;
}

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a digit.  The `digit' predicate set is loaded
    on first use; the answer is bit {b % 8} of its byte {b / 8}.  </doc>
boolean
  isDigit byte b
{
  if (is_digit == nil)
    is_digit = [self loadPredicateSet "digit"];

  int mask = 1 << (b % 8);

  = (is_digit[b / 8] & mask) != 0;
}

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a lowercase letter.  The `lower' predicate set
    is loaded on first use; the answer is bit {b % 8} of its byte
    {b / 8}.  </doc>
boolean
  isLower byte b
{
  if (is_lower == nil)
    is_lower = [self loadPredicateSet "lower"];

  int mask = 1 << (b % 8);

  = (is_lower[b / 8] & mask) != 0;
}

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a punctuation character.  The `punct'
    predicate set is loaded on first use; the answer is bit {b % 8} of
    its byte {b / 8}.  </doc>
boolean
  isPunct byte b
{
  if (is_punct == nil)
    is_punct = [self loadPredicateSet "punct"];

  int mask = 1 << (b % 8);

  = (is_punct[b / 8] & mask) != 0;
}

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is a space character.  The `space' predicate set
    is loaded on first use; the answer is bit {b % 8} of its byte
    {b / 8}.  </doc>
boolean
  isSpace byte b
{
  if (is_space == nil)
    is_space = [self loadPredicateSet "space"];

  int mask = 1 << (b % 8);

  = (is_space[b / 8] & mask) != 0;
}

<doc> Return {TRUE} iff the character denoted by the byte {b} in the
    receiving encoding is an uppercase letter.  The `upper' predicate
    set is loaded on first use; the answer is bit {b % 8} of its byte
    {b / 8}.  </doc>
boolean
  isUpper byte b
{
  if (is_upper == nil)
    is_upper = [self loadPredicateSet "upper"];

  int mask = 1 << (b % 8);

  = (is_upper[b / 8] & mask) != 0;
}

/******************** character case conversion ********************/

<doc> Return the lowercase version of the byte {b}, as given by entry {b}
    of the `lower' conversion table of the receiving encoding.  The
    table is loaded on first use.  </doc>
byte
  toLower byte b
{
  if (to_lower == nil)
    to_lower = [self loadConversion "lower"];

  = to_lower[b];
}

<doc> Return the uppercase version of the byte {b}, as given by entry {b}
    of the `upper' conversion table of the receiving encoding.  The
    table is loaded on first use.  </doc>
byte
  toUpper byte b
{
  if (to_upper == nil)
    to_upper = [self loadConversion "upper"];

  = to_upper[b];
}

<doc> Intended to return the numeric value of the digit denoted by the
    byte {b} in the receiving encoding.  This is not implemented for
    {CharEncoding}: the method only reports itself as unimplemented and
    does not assign a return value.  </doc>
int
  digitValue byte b
{
  [self unimplemented cmd];
}

<doc> Intended to return the index of the letter {b} relative to the
    start of its letter range (thus, 'a' gives 0, 'f' gives 5, etc).
    This is not implemented for {CharEncoding}: the method only reports
    itself as unimplemented and does not assign a return value.  </doc>
int
  alphaValue byte b
{
  [self unimplemented cmd];
}

end;

/******************** USASCIIEncoding ********************/

<doc> A stand-in for a real {CharEncoding}, used during program
    initialization.  There is a single instance, handed out by
    {shared}.  </doc>
implementation class
USASCIIEncoding: State, CharacterEncoding
{
  <doc> The one and only {USASCIIEncoding} object, or {nil} as long as
      {shared} has not been invoked.  </doc>
  static USASCIIEncoding shared;
}

<doc> Return the single {USASCIIEncoding} object, allocating it upon the
    first invocation.  The object is only allocated; no initializer is
    invoked on it.  </doc>
instance (id)
  shared
{
  if (shared == nil)
    shared = [self alloc];

  = shared;
}

end;

implementation instance
USASCIIEncoding

<doc> Return {nil}.  We're really a dummy, so we do not have a name.  In
    fact, that is how we're recognized.  </doc>
String
  name
{
  = nil;
}

<doc> Return the byte {b} itself as the character, i.e. the byte value is
    taken to be the character code.  This is acceptable for iso-8859-1.
    </doc>
char
  decode byte b
{
  = b;
}

<doc> Return the character {c} as a byte if its value does not exceed
    255, and 127 otherwise.  No condition is signaled for characters
    beyond 255.  This is acceptable for iso-8859-1.  </doc>
byte
  encode char c
{
  if (c > char (255))
    = byte (127);
  else
    = byte (c);
}

<doc> Return {TRUE} iff the byte {b} is an ASCII letter, i.e. lies in
    'a' through 'z' or in 'A' through 'Z', both ranges inclusive.  </doc>
boolean
  isAlpha byte b
{
  // Both bounds are inclusive; the upper bound of the lowercase range
  // used to be exclusive, which left out the letter 'z'.
  = (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z');
}

<doc> Return {TRUE} iff the byte {b} lies in '0' through '9'.  </doc>
boolean
  isDigit byte b
{
  = !(b < '0' || b > '9');
}

<doc> Return {TRUE} iff the byte {b} lies in 'a' through 'z'.  </doc>
boolean
  isLower byte b
{
  = !(b < 'a' || b > 'z');
}

<doc> Return {TRUE} iff the byte {b} is one of the six characters
    <code>. , ; : ! ?</code>.  Other punctuation characters are not
    recognized.  </doc>
boolean
  isPunct byte b
{
  // Incomplete...
  // Mon Nov  4 16:18:50 1996, tiggr@jaguar.ics.ele.tue.nl
  boolean terminator = b == '.' || b == '!' || b == '?';
  boolean separator = b == ',' || b == ';' || b == ':';

  = terminator || separator;
}

<doc> Return {TRUE} iff the byte {b} is a blank or lies in the range
    from '\t' through '\r'.  </doc>
boolean
  isSpace byte b
{
  boolean control = b >= '\t' && b <= '\r';

  = control || b == ' ';
}

<doc> Return {TRUE} iff the byte {b} lies in 'A' through 'Z'.  </doc>
boolean
  isUpper byte b
{
  = !(b < 'A' || b > 'Z');
}

/******************** character case conversion ********************/

<doc> Return the byte {b} plus 32 if {isUpper} holds for it; any other
    byte is returned as is.  </doc>
byte
  toLower byte b
{
  if ([self isUpper b])
    = byte (b + 32);
  else
    = b;
}

<doc> Return the byte {b} minus 32 if {isLower} holds for it; any other
    byte is returned as is.  </doc>
byte
  toUpper byte b
{
  if ([self isLower b])
    = byte (b - 32);
  else
    = b;
}

<doc> Return the numeric value of the digit {b}, computed as the
    difference between {b} and '0'.  No check is made that {b} actually
    is a digit.  </doc>
int
  digitValue byte b
{
  = b - '0';
}

<doc> Return the offset of the byte {b} from 'A' if {isUpper} holds for
    it, and its offset from 'a' otherwise.  No check is made that {b}
    actually is a letter.  </doc>
int
  alphaValue byte b
{
  if ([self isUpper b])
    = b - 'A';
  else
    = b - 'a';
}

end;
