<copyright> ByteString class.
    Written by <a href="mailto:tiggr@ics.ele.tue.nl">Pieter J. Schoenmakers</a>

    Copyright &copy; 1995-1997 Pieter J. Schoenmakers.

    This file is part of TOM.  TOM is distributed under the terms of the
    TOM License, a copy of which can be found in the TOM distribution; see
    the file LICENSE.

    <id>$Id: ByteString.t,v 1.62 1998/05/17 22:53:40 tiggr Exp $</id>
    </copyright>

// Read this doc...
// Mon Sep 23 14:05:57 1996, tiggr@cobra.es.ele.tue.nl

<doc> A {ByteString} is a {String} and a {ByteArray}, which can do all
    kinds of nice string-like things.

    Requesting a {substring} of a {ByteString} returns a {ByteSubstring}.
    A {ByteSubstring} mimics a {ByteString} as much as possible,
    including hashing, equality, uniquing, printing, copying, etc., but
    the two classes do not share a common superclass between {String} and
    {ByteString}.

    In the future, the actual functionality of the {ByteString} instance
    could be put into a {ByteFullstring}, enabling the {ByteSubstring} to
    actually become a subclass of {ByteString}...  </doc>
implementation class
ByteString: ByteArray, String, C, Constants
{
  <doc> The default character encoding for {ByteString} instances.

      Never refer to this variable directly; always ask the string (even
      if it is {self}) for its {encoding}.  A normal {ByteString} will
      then return this {default_encoding}.  </doc>
  static CharacterEncoding default_encoding;
}

<doc> Describe the command line arguments understood by the {ByteString}
    unit on the stream {s}.  The unit is recorded in {done}, so the text
    is output only the first time the unit is asked for help.  The stream
    resulting from the output (or {s} itself when nothing was output) is
    returned.  </doc>
OutputStream
  help OutputStream s
  done MutableKeyed done
{
  class (State) unit = [ByteString self];

  if (!done[unit])
    {
      [done add unit];

      s = [s print "tom.ByteString
  :char-encoding <name>	set the character encoding to <name>.
			default is `iso-8859-1'"];
      s = [s nl];
    }

  = s;
}

<doc> Set the default byte encoding from the command line {arguments}.
    Every `:char-encoding <name>' pair is removed from {arguments}; when
    several pairs are present, the name of the last one is used.  If it is
    not specified on the command line, {iso-8859-1} will be used.

    A trailing `:char-encoding' which is not followed by a <name> is
    reported on the standard error stream and removed, instead of
    indexing {arguments} beyond its last element.

    Nothing is done when the {default_encoding} already has a name.

    Before this method is invoked by the runtime library, the
    {default_encoding} will be a {USASCIIEncoding}.  </doc>
void
  load MutableArray arguments
{
  if (![default_encoding name])
    {
      String name = "iso-8859-1";
      int i = 0, n = [arguments length];

      /* The index only advances over arguments that are left in place;
	 after a removal the next candidate has moved to position {i}.  */
      while (i < n)
	if (![arguments[i] equal ":char-encoding"])
	  i++;
	else if (i + 1 < n)
	  {
	    name = arguments[i + 1];
	    [arguments removeElements (i, 2)];
	    n -= 2;
	  }
	else
	  {
	    /* The option is the very last argument: there is no name to
	       fetch, and only one element to remove.  */
	    [[[stdio err] print "missing name after `:char-encoding'"] nl];
	    [arguments removeElements (i, 1)];
	    n--;
	  }

      [self switchToEncoding name];
    }
}

<doc> Make the encoding called {name} the {default_encoding}.  When no
    decoding table can be obtained for that encoding, moan on the standard
    error stream and leave the current encoding untouched.  </doc>
void
  switchToEncoding String name
{
  CharEncoding candidate = [CharEncoding named name];

  /* Asking for the decoding table before the assignment makes sure the
     table has been read by the time the encoding becomes the real
     {default_encoding}.  If the resources can not be found, we stick
     with the encoding we already had (initially USASCII).  */
  if ([candidate decoding] == nil)
    [[[stdio err] print ("can't find encoding named `", name, "'")] nl];
  else
    default_encoding = candidate;
}

end;

/******************** instance ByteString ********************/

implementation instance
ByteString

<doc> Return the Unicode character for the byte at {index}, obtained by
    having the receiver's {encoding} decode that byte.  </doc>
char
  at int index
{
  CharacterEncoding enc = [self encoding];

  = [enc decode self[index]];
}

// This can be removed as soon as trt uses tom io instead of C's stdio.
// Fri May 24 16:49:43 1996, tiggr@cobra.es.ele.tue.nl
// This is not acceptable in the world which knows about character encodings.
// Fri Nov  1 12:51:13 1996, tiggr@jaguar.ics.ele.tue.nl
<doc> Return the tuple of the receiver's {contents} pointer and its
    {length}.  The pointer is handed out as is; nothing is copied.  </doc>
(pointer, int)
  byteStringContents
{
  = (contents, length);
}

<doc> Return whether the receiver equals the {other} string.  The question
    is passed on to {other}, by asking it {equalByteString} with the
    receiver as the argument.  </doc>
boolean
  equal String other
{
  boolean same = [other equalByteString self];

  = same;
}

<doc> Presumably the hash value of the receiver's range {(start, len)}
    (to be confirmed against the implementation).  This method is
    declared {extern}: its body is not part of this file.  </doc>
extern int
  hashRange (int, int) (start, len);

<doc> Presumably whether the receiver equals the {ByteString} {other}
    (to be confirmed against the implementation).  This is the method
    that {equal} and {equalUniqueString} invoke on their argument.  It is
    declared {extern}: its body is not part of this file.  </doc>
extern boolean
  equalByteString ByteString other;

<doc> Comparing with a {CharString} is not implemented: the receiver is
    sent {unimplemented} with {cmd} as the argument.  This method does not
    set a return value of its own.  </doc>
boolean
  equalCharString CharString other
{
  [self unimplemented cmd];
}

<doc> Return whether the receiver equals the {UniqueString} {other}.  The
    question is passed on to {other}, by asking it {equalByteString} with
    the receiver as the argument.  </doc>
boolean
  equalUniqueString UniqueString other
{
  boolean same = [other equalByteString self];

  = same;
}

<doc> Initialize the newly allocated instance to refer to the {num} bytes
    at {p}.  The bytes are not copied; the receiving instance will `own'
    the memory at {p}.  </doc>
protected id (self)
  init (pointer, int) (p, num)
{
  contents = p;
  length = num;
}

<doc> Initialize the newly allocated instance with a private copy of the
    {num} bytes at {p}.  The copy lives in memory freshly obtained from
    {malloc}; the memory at {p} is only read.  </doc>
id (self)
  initCopy (pointer, int) (p, num)
{
  pointer duplicate = memcpy (malloc (num), p, num);

  contents = duplicate;
  length = num;
}

<doc> Return a new instance of the receiver's {mutableCopyClass}, holding
    a copy of the bytes in the receiver's range {(start, len)}.  The
    address and the number of bytes actually copied are those reported by
    {pointerToElements} for that range.  </doc>
MutableByteString
  mutableSubstring (int, int) (start, len)
{
  pointer bytes;
  int count;
  class (MutableByteString) factory;

  (bytes, count) = [self pointerToElements (start, len)];
  factory = class (MutableByteString) ([self mutableCopyClass]);

  = [[factory alloc] initCopy (bytes, count)];
}

<doc> Return the substring of the receiver for the range {(start, len)},
    after the range has been passed through {adjustRange}.  If the
    adjusted range spans the whole receiver, the receiver itself is
    returned; otherwise the result is a {ByteSubstring} on the receiver.
    </doc>
String
  substring (int, int) (start, len)
{
  String result;

  (start, len) = [self adjustRange (start, len)];

  if (start != 0 || len != length)
    result = [ByteSubstring with (start, len) in self];
  else
    result = self;

  = result;
}

<doc> Return a {UniqueByteString} initialized, through {initCopy}, with a
    copy of all bytes of the receiver.  </doc>
UniqueByteString
  uniqueString
{
  UniqueByteString fresh = [UniqueByteString alloc];

  = [fresh initCopy (contents, length)];
}

<doc> Write the receiver to the stream {s}, by asking {s} to {writeBytes}
    the receiver.  Return {s}.  </doc>
OutputStream
  write OutputStream s
{
  [s writeBytes self];

  = s;
}

/******************** copying ********************/
<doc> <h4>Copying</h4> </doc>

<doc> Return the {MutableByteString} class.  This is the class which
    {mutableSubstring} asks to allocate its result.  </doc>
class (State)
  mutableCopyClass
{
  = [MutableByteString self];
}

/******************** encodings ********************/

<doc> Return the encoding of the receiving {ByteString}.  This
    implementation returns the class-wide {default_encoding}, which is
    the one set by {switchToEncoding}.  </doc>
CharacterEncoding
  encoding
{
  = default_encoding;
}

<doc> Return the result of {stringByDemapping} the receiver with the
    decoding table of the encoding called {encoding_name}.  </doc>
String
  stringByDecoding String encoding_name
{
  CharEncoding enc = [CharEncoding named encoding_name];
  CharArray table = [enc decoding];

  = [self stringByDemapping table];
}

<doc> Return a new {MutableCharString} holding, for every byte of the
    receiver, the character found in {demap} at the index given by that
    byte.  If {demap} is {nil}, every byte is used as its own character
    value.  The result is frozen unless the receiver is {mutable}.  </doc>
String
  stringByDemapping CharArray demap
{
  MutableCharString decoded = [MutableCharString withCapacity length];
  int pos;
  char c;

  /* Without a {demap}, something went wrong and we revert to the
     identical mapping (strongly resembling ISO 8859-1).  */
  if (!demap)
    for (pos = 0; pos < length; pos++)
      {
	c = self[pos];
	decoded[pos] = c;
      }
  else
    for (pos = 0; pos < length; pos++)
      {
	c = demap[self[pos]];
	decoded[pos] = c;
      }

  if (![self mutable])
    [decoded freeze];

  = decoded;
}

/******************** predicates ********************/

<doc> Return {TRUE} if the character denoted by the byte {b}, in the
    encoding of the receiving string, is a letter.  </doc>
boolean
  isAlpha byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc isAlpha b];
}

<doc> Return {TRUE} if the character denoted by the byte {b}, in the
    encoding of the receiving string, is a digit.  </doc>
boolean
  isDigit byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc isDigit b];
}

<doc> Return {TRUE} if the character denoted by the byte {b}, in the
    encoding of the receiving string, is a lowercase letter.  </doc>
boolean
  isLower byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc isLower b];
}

<doc> Return {TRUE} if the character denoted by the byte {b}, in the
    encoding of the receiving string, is a punctuation character.  </doc>
boolean
  isPunct byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc isPunct b];
}

<doc> Return {TRUE} if the character denoted by the byte {b}, in the
    encoding of the receiving string, is a space character.  </doc>
boolean
  isSpace byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc isSpace b];
}

<doc> Return {TRUE} if the character denoted by the byte {b}, in the
    encoding of the receiving string, is an uppercase letter.  </doc>
boolean
  isUpper byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc isUpper b];
}

/******************** conversions ********************/

<doc> Return the lowercase version of the byte {b}, as determined by the
    encoding of the receiving string.  A character which is not in
    uppercase is returned unharmed.  </doc>
byte
  toLower byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc toLower b];
}

<doc> Return the uppercase equivalent of the byte {b}, as determined by
    the encoding of the receiving string.  A character which is not in
    lowercase is returned unharmed.  </doc>
byte
  toUpper byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc toUpper b];
}

<doc> Return the numeric value of the digit denoted by the byte {b}, as
    determined by the encoding of the receiving string.  The byte {b}
    should be one for which this string returns {TRUE} when asked
    {isDigit}.  </doc>
int
  digitValue byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc digitValue b];
}

<doc> Return the index of the letter {b} relative to the start of its
    letter range, as determined by the encoding of the receiving string.
    Thus, 'a' returns 0, 'f' returns 5, etc.  </doc>
int
  alphaValue byte b
{
  CharacterEncoding enc = [self encoding];

  = [enc alphaValue b];
}

// This shows that `template methods' are a good idea, since the only
// difference is the declaration of C.
// Sun Nov 17 16:46:06 1996, tiggr@jaguar.ics.ele.tue.nl

// Actually, these methods aren't all that faster!?
// Sun Nov 17 17:22:35 1996, tiggr@jaguar.ics.ele.tue.nl
<doc> Return the receiver with every uppercase byte replaced by its
    lowercase version.  If the receiver holds no uppercase byte, the
    receiver itself is returned.  Otherwise the result is a mutable copy
    of the receiver, made when the first uppercase byte is met, and
    frozen unless the receiver is {mutable}.

    This version of {downcase} overrides the implementation by {String};
    it works on the bytes directly, avoiding the unnecessary conversion
    to/from Unicode.  </doc>
id
  downcase
{
  MutableByteString lowered;
  int pos = 0, count = [self length];
  id answer;

  while (pos < count)
    {
      byte b = self[pos];

      if ([self isUpper b])
	{
	  /* The copy is made lazily, so a string without uppercase
	     bytes costs no allocation.  */
	  if (!lowered)
	    lowered = MutableByteString ([self mutableSubstring (0, -1)]);
	  lowered[pos] = [self toLower b];
	}

      pos++;
    }

  if (!lowered)
    answer = self;
  else
    {
      if (![self mutable])
	[lowered freeze];
      answer = lowered;
    }

  = answer;
}

<doc> Return the receiver with every lowercase byte replaced by its
    uppercase equivalent.  If the receiver holds no lowercase byte, the
    receiver itself is returned.  Otherwise the result is a mutable copy
    of the receiver, made when the first lowercase byte is met, and
    frozen unless the receiver is {mutable}.

    Like {downcase}, this overrides the implementation provided by
    {String} and works on the bytes directly.  </doc>
id
  upcase
{
  MutableByteString raised;
  int pos = 0, count = [self length];
  id answer;

  while (pos < count)
    {
      byte b = self[pos];

      if ([self isLower b])
	{
	  /* The copy is made lazily, so a string without lowercase
	     bytes costs no allocation.  */
	  if (!raised)
	    raised = MutableByteString ([self mutableSubstring (0, -1)]);
	  raised[pos] = [self toUpper b];
	}

      pos++;
    }

  if (!raised)
    answer = self;
  else
    {
      if (![self mutable])
	[raised freeze];
      answer = raised;
    }

  = answer;
}

end;
