/*-*-c++-*---------------------------------------------*\
  cloak.pike: Hides/mutilates mailto links for search
	      engines and similar robots scooping your
	      site for mail addresses, rendering your
	      mail addresses a bit more spam safe
	      without your using ugly crippled mail
	      addresses like foo@nospam.bar.baz etc

  Written by: Johan Sundström <jhs@lysator.liu.se>

  $Id: cloak.pike,v 1.2 1999/09/05 05:28:59 johan Exp $
\*-----------------------------------------------------*/

#include <module.h>
inherit "module";
inherit "roxenlib";

// User and host parts of the configured "address" variable. They are
// assigned in check_variable() and read by substitute() whenever a
// part is not "selected at random" or "left unchanged".
string mailuser, mailhost;

// RCS/CVS $Id$ keyword string identifying this revision of the file.
constant cvs_version = "$Id: cloak.pike,v 1.2 1999/09/05 05:28:59 johan Exp $";
// Sample address used in the help text in create() and for the
// example output built by info().
#define EXAMPLE_ADDRESS "j.random@hacker.org"

// Of course we only make realistic mail addresses... ;-)
// First up, the characters allowed by RFC 821 for mail addresses.
// random_name() draws every character of a generated user name from
// this string:
constant username_chars = "!#$%&'*+-./0123456789=?ABCDEFGHIJKLMNOPQRS"
                          "TUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz{|}~";

// Character sets used by random_host() for each dot-separated element
// of a generated host name: the first character is always a letter,
// the last one a letter or a digit, and the characters in between may
// additionally be a hyphen.
constant host_first_char = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                           "abcdefghijklmnopqrstuvwxyz";
constant host_last_char = "ABCDEFGHIJKLMNOPQRSTUVWXYZ01234"
                          "abcdefghijklmnopqrstuvwxyz56789";
constant host_rest_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ01234"
                          "-abcdefghijklmnopqrstuvwxyz56789";

// Then, the list of top domains that random_host() picks the final
// element of a generated host name from: the generic three-letter
// domains first, followed by two-letter country codes.
// NOTE(review): this is a hand-maintained snapshot; a few entries
// (e.g. "eq", "ln", "un") do not look like delegated top level
// domains -- confirm against the current IANA root zone list.
constant topdomains = ({ "com", "edu", "gov", "mil", "net", "org", "ac",
 "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as",
 "at", "au", "aw", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi",
 "bj", "bm", "bn", "bo", "br", "bs", "bt", "bv", "bw", "by", "bz", "ca",
 "cc", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "cr", "cs",
 "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec",
 "ee", "eg", "eh", "eq", "es", "et", "fi", "fj", "fk", "fm", "fo", "fr",
 "ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp",
 "gq", "gr", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu",
 "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je", "jm",
 "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky",
 "kz", "la", "lb", "lc", "li", "lk", "ln", "lr", "ls", "lt", "lu", "lv",
 "ly", "ma", "mc", "md", "mg", "mh", "mk", "ml", "mm", "mn", "mo", "mp",
 "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "nc",
 "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nt", "nu", "nz", "om",
 "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "pt", "pw",
 "py", "qa", "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg",
 "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st", "su", "sv",
 "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tm", "tn", "to",
 "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "un", "us", "uy",
 "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yu",
 "za", "zm", "zw" });

// Builds a random user name of one to eight characters, each drawn
// independently from username_chars.
string random_name()
{
  int length = random(8) + 1;
  array picked = allocate(length);

  for(int i = 0; i < length; i++)
  {
    int pos = random(sizeof(username_chars));
    picked[i] = username_chars[pos..pos];
  }

  // Glue the single-character strings together.
  return picked * "";
}

// Builds a random host name consisting of one to three generated
// elements followed by an entry picked from topdomains, all joined
// with dots. Every element is 2-11 characters long: one character
// from host_first_char, up to nine from host_rest_chars and a final
// one from host_last_char.
string random_host()
{
  array labels = ({ });
  int levels = random(3) + 1;

  for(int level = 0; level < levels; level++)
  {
    int pos = random(sizeof(host_first_char));
    string label = host_first_char[pos..pos];

    for(int middle = random(10); middle > 0; middle--)
    {
      pos = random(sizeof(host_rest_chars));
      label += host_rest_chars[pos..pos];
    }

    pos = random(sizeof(host_last_char));
    labels += ({ label + host_last_char[pos..pos] });
  }

  labels += ({ topdomains[random(sizeof( topdomains ))] });
  return labels * ".";
}

// Returns the replacement for the mail address `address', built
// according to the "user" and "host" module variables. Each part is
// either generated at random, copied from `address', or (for any
// other setting) taken from the cached mailuser / mailhost values.
string substitute(string address)
{
  string user_mode = QUERY(user), host_mode = QUERY(host);
  string local_part, domain;

  if(user_mode == "selected at random")
    local_part = random_name();
  else if(user_mode == "left unchanged")
  {
    // Fall back on the whole string when there is no "@" to split at.
    local_part = address;
    sscanf(address, "%s@", local_part);
  }
  else
    local_part = mailuser;

  if(host_mode == "selected at random")
    domain = random_host();
  else if(host_mode == "left unchanged")
  {
    // An address without "@" gets an empty host part.
    domain = "";
    sscanf(address, "%*s@%s", domain);
  }
  else
    domain = mailhost;

  return local_part + "@" + domain;
}

// Returns the module registration array: module type, module name,
// HTML description and two trailing values.
array register_module()
{
  string title = "Email Address Cloaking Device";

  // info() splits this text at its single "\n" and inserts an example
  // between the two halves, so the line break must stay where it is.
  string description =
    "<p>"
      "This module mutilates mailto links for all web robots "
      "scooping your site for mail addresses, rendering your mail "
      "addresses a bit more spam safe without your using ugly "
      "crippled mail addresses like foo@nospam.bar.baz etc."
    "</p>\n<p>"
      "The module accomplishes this by filtering all <tt>&lt;a href="
      "\"mailto:...\"&gt;</tt> tags and all <tt>&lt;form method=POST"
      " action=\"mailto:...\"&gt;</tt> tags when the browser is a "
      "web robot (as detected by the Roxen supports database), "
      "replacing those links with whatever the module is set up to. "
      "No extra strain is put on the server when common visitors "
      "view your pages; then, the module will just silently pass "
      "the request on to other modules."
    "</p><p>"
      "More documentation for the module can be found at <a href=\""
      "http://a205.ryd.student.liu.se/(Cloaking.Device)/docs/"
      "\">the module author's site</a>."
    "</p>";

  // NOTE(review): the trailing 0 and 1 are presumably the reserved
  // slot and the "only one instance" flag of the Roxen module API --
  // confirm.
  return ({ MODULE_FILTER, title, description, 0, 1 });
}

// Sets the module creator and URL and defines the module's four
// configuration variables:
//   user           - how the user part of an address is replaced
//   host           - how the host part of an address is replaced
//   address        - the address whose parts are used for the
//                    "substituted for a set string" choices
//   contents_check - whether the text inside an <a> tag is replaced
//                    as well when it contains an "@"
void create()
{
  set_module_creator("Johan Sundstrm");
  set_module_url("http://a205.ryd.student.liu.se/(Cloaking.Device)/docs/");

  defvar( "user", "substituted for a set string",
          "Address' user part will be", TYPE_STRING_LIST,
          "<p>This defines how the username part of the address will turn "
          "out in the substituted mail addresses.</p>",
          ({ "substituted for a set string",
             "selected at random",
             "left unchanged" }));

  defvar( "host", "substituted for a set string",
          "Address' host part will be", TYPE_STRING_LIST,
          "<p>This defines how the host part of the address will turn out "
          "in the substituted mail addresses.</p>",
          ({ "substituted for a set string",
             "selected at random",
             "left unchanged" }));

  defvar( "address", "abuse@127.0.0.1",
          "Addresses will be substituted for", TYPE_STRING,
          "<p>This is (at least almost) the address that search engines "
          "will find instead of those you use in your documents. Exactly "
          "how they turn out depends on the other settings.</p>"
         );

  // The closing quote of the href attribute belongs after the example
  // address, so that the help text shows a well-formed tag:
  // <a href="mailto:j.random@hacker.org">j.random@hacker.org</a>
  defvar( "contents_check", 1,
          "Target plaintext of link as well", TYPE_FLAG,
          "<p>This will not only change the href part of an <tt>&lt;a "
          "href=\"mailto:" + EXAMPLE_ADDRESS + "\"&gt;" + EXAMPLE_ADDRESS
          + "&lt;/a&gt;</tt> tag, but also its tag contents, when it "
          "includes something that could be interpreted as an email "
          "address (contains the character '@'). The contents of the "
          "tag will then be substituted for the same address as was "
          "the href argument.</p>"
         );
}

// Called with the name and value of a module variable. For the
// "address" variable, the value is split at its first "@" and the two
// parts are cached in mailuser and mailhost for substitute() to use.
// Other variables are ignored. Never returns a value.
//
// Both parts are cached unconditionally. substitute() only reads them
// for the "substituted for a set string" setting, so this is harmless
// for the other settings, and it keeps the cache valid no matter in
// which order the "user", "host" and "address" variables are changed.
string|void check_variable(string name, mixed value)
{
  string user = "", host = "";
  switch( name )
  {
    case "address":
      // Without an "@" in the value both parts are left empty.
      sscanf(value, "%s@%s", user, host);
      mailuser = user;
      mailhost = host;
      break;
  }
}

// Runs check_variable() on the current value of each of the "user",
// "host" and "address" variables, in that order, which fills in the
// cached mailuser and mailhost values.
void start()
{
  string var;
  foreach(({ "user", "host", "address" }), var)
    check_variable(var, query( var ));
}

// Nothing to clean up; the module holds no resources.
void stop()
{
  return;
}

// Returns the module description from register_module() with an extra
// paragraph inserted at its line break, showing what EXAMPLE_ADDRESS
// is turned into with the current settings. When any part is selected
// at random, five sample results are listed instead of one.
string info(object conf)
{
  array doc_parts = (this->register_module(conf)[2])/"\n";
  multiset fixed_result = (< "substituted for a set string",
                             "left unchanged" >);
  string example;

  if(!fixed_result[QUERY(user)] || !fixed_result[QUERY(host)])
  {
    // Random output: show a handful of samples.
    array samples = ({ });
    while(sizeof(samples) < 5)
      samples += ({ substitute(EXAMPLE_ADDRESS) });
    example = "something like " + String.implode_nicely(samples, "or");
  }
  else
    example = substitute(EXAMPLE_ADDRESS);

  string head = doc_parts[0], tail = doc_parts[1];
  return sprintf("%s<p>Using current options, a typical address %s "
                 "would be turned into %s before being handed away "
                 "to a web-traversing robot.</p>\n%s",
                 head, EXAMPLE_ADDRESS, example, tail);
}

// parse_html() callback for <a> and <form> containers.
//
//   name     - the tag name ("a" or "form")
//   arg      - the tag's attributes; href / action is rewritten
//   contents - the unparsed text between the start and end tag
//   id       - the request object handed on by filter()
//
// Returns 0 when the tag does not point at a mailto: address, which
// leaves the tag as it is. Otherwise returns a one-element array with
// the rebuilt container, in which the address after "mailto:" has been
// replaced by substitute().
mixed container_link(string name, mapping arg, string contents, object id)
{
  string protocol, address;
  switch( name )
  {
    case "a":
      if(zero_type(arg->href)
      || lower_case(protocol = arg->href[..6]) != "mailto:")
        return 0;
      address = substitute(arg->href[7..]);
      // Keep the original spelling of the "mailto:" prefix.
      arg->href = protocol + address;
      // Optionally replace a link text that looks like an address too.
      if(QUERY(contents_check)
      && (search(contents, "@") != -1))
        contents = address;
      break;

    case "form":
      if(zero_type(arg->action)
      || lower_case(arg->action[..6]) != "mailto:")
        return 0;
      arg->action = arg->action[..6] + substitute(arg->action[7..]);
      // The array returned below is inserted into the page without
      // being parsed again, so mailto links inside this form would
      // otherwise slip through uncloaked. Handle them here.
      contents = parse_html( contents, ([ ]),
                             ([ "a" : container_link ]), id );
      break;
  }
  return ({ make_container(name, arg, contents) });
}

// Filter entry point. Passes `result' through untouched unless it is
// an HTML page held as a string and the client is flagged as a robot
// in id->supports; in that case all <a> and <form> containers in the
// page are run through container_link().
mapping|void filter(mapping result, object id)
{
  // If nobody had anything to say, neither do we.
  if(!result)
    return result;

  // Only parse html that is available as a string.
  if(result->type != "text/html")
    return result;
  if(!stringp(result->data))
    return result;

  // We only care about robots!
  if(!(id->supports->robot))
    return result;

  mapping containers = ([ "a"    : container_link,
                          "form" : container_link ]);
  result->data = parse_html( result->data, ([ ]), containers, id );
  return result;
}
