// boinc/lib/xml_util.C

// $Id$
// The contents of this file are subject to the BOINC Public License
// Version 1.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://boinc.berkeley.edu/license_1.0.txt
//
// Software distributed under the License is distributed on an "AS IS"
// basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
// License for the specific language governing rights and limitations
// under the License.
//
// The Original Code is the Berkeley Open Infrastructure for Network Computing.
//
// The Initial Developer of the Original Code is the SETI@home project.
// Portions created by the SETI@home project are Copyright (C) 2002
// University of California at Berkeley. All Rights Reserved.
//
// Contributor(s):
//
// Revision History
// $Log$
// Revision 1.29  2004/04/05 20:09:41  korpela
// Rewrote extract_xml_record() to solve some problems...
//
// Revision 1.28  2004/03/06 09:45:25  rwalton
// *** empty log message ***
//
// Revision 1.27  2004/01/22 17:57:41  davea
// *** empty log message ***
//
// Revision 1.26  2004/01/20 02:51:50  korpela
// VC 7 mods
//
// Revision 1.25  2003/12/01 23:42:05  korpela
// Under some compilers template parameters of type char [] weren't getting
// cast to char *.  Template functions now use &(array[0]) to ensure correct
// type is used.
//
//
#include "config.h"

#include <cctype>
#include <vector>
#include <string>
#include <sstream>
#include <cstring>
#include <cstdio>

#include "std_fixes.h"
#include "xml_util.h"

// Current indentation depth (in spaces).  Updated by xml_indent() and read
// by the output helpers below.
int xml_indent_level=0;

// Adjusts the global indentation depth by i (a negative i unindents; the
// depth is clamped so it never drops below zero) and returns a string of
// spaces for the resulting depth, capped at XML_MAX_INDENT characters.
// Calling with i == 0 returns the current indentation without changing it.
std::string xml_indent(int i) {
  xml_indent_level+=i;
  if (xml_indent_level<0) {
    xml_indent_level=0;
  }
  const int width=std::min(xml_indent_level,XML_MAX_INDENT);
  return std::string(width,' ');
}
// Translation table between single character codes (0x00-0xff) and named
// XML/HTML entities.  The table is terminated by an entry whose string
// pointer is null; the lookup loops in this file stop at that entry.
//
// Most of these entries are for reverse translation (entity -> character) of
// poorly written HTML.  Forward translation (character -> entity) doesn't
// translate most printable characters.
//
// Some characters are listed under more than one entity name (for example
// '"' as both "&quot;" and "&dquot;") so that each spelling can be decoded.
// The linear search in encode_char() returns the first entry found for a
// character, so the order of duplicates matters for encoding.

const xml_entity xml_trans[]= {
  // Control characters
  { 0x07, "&bel;" },
  { 0x0a, "&lf;" },
  { 0x0d, "&cr;" },
  // Printable ASCII punctuation
  { ' ', "&sp;" },
  { '!', "&excl;" },
  { '\"', "&quot;" },
  { '\"', "&dquot;" },
  { '#', "&num;" },
  { '$', "&dollar;" },
  { '%', "&percnt;" },
  { '&', "&amp;" },
  { '\'', "&apos;" },
  { '(', "&lpar;" },
  { ')', "&rpar;" },
  { '*', "&ast;" },
  { '+', "&plus;" },
  { ',', "&comma;" },
  { '-', "&hyphen;" },
  { '-', "&minus;" },
  { '.', "&period;" },
  { '/', "&sol;" },
  { ':', "&colon;" },
  { ';', "&semi;" },
  { '<', "&lt;" },
  { '=', "&equals;" },
  { '>', "&gt;" },
  { '?', "&quest;" },
  { '@', "&commat;" },
  { '[', "&lsqb;" },
  { '\\', "&bsol;" },
  { ']', "&rsqb;" },
  { '^', "&circ;" },
  { '_', "&lowbar;" },
  { '_', "&horbar;" },
  { '`', "&grave;" },
  { '{', "&lcub;" },
  { '|', "&verbar;" },
  { '}', "&rcub;" },
  { '~', "&tilde;" },
  // 0x82-0x97: typographic punctuation.  NOTE(review): these values look
  // like Windows-1252 code points -- confirm the intended character set.
  { 0x82, "&lsquor;" },
  { 0x84, "&ldquor;" },
  { 0x85, "&ldots;" },
  { 0x8a, "&Scaron;" },
  { 0x8b, "&lsaquo;" },
  { 0x8c, "&OElig;" },
  { 0x91, "&lsquo;" },
  { 0x91, "&rsquor;" },
  { 0x92, "&rsquo;" },
  { 0x93, "&ldquo;" },
  { 0x93, "&rdquor;" },
  { 0x94, "&rdquo;" },
  { 0x95, "&bull;" },
  { 0x96, "&ndash;" },
  { 0x96, "&endash;" },
  { 0x97, "&mdash;" },
  { 0x97, "&emdash;" },
  // 0xa0-0xff: symbols and accented letters.  NOTE(review): the values
  // appear to follow ISO-8859-1 (Latin-1) -- confirm.
  { 0xa0, "&nbsp;" },
  { 0xa1, "&iexcl;" },
  { 0xa2, "&cent;" },
  { 0xa3, "&pound;" },
  { 0xa4, "&curren;" },
  { 0xa5, "&yen;" },
  { 0xa6, "&brvbar;" },
  { 0xa7, "&sect;" },
  { 0xa8, "&uml;" },
  { 0xa9, "&copy;" },
  { 0xaa, "&ordf;" },
  { 0xab, "&laquo;" },
  { 0xac, "&not;" },
  { 0xad, "&shy;" },
  { 0xae, "&reg;" },
  { 0xaf, "&macr;" },
  { 0xb0, "&deg;" },
  { 0xb1, "&plusmn;" },
  { 0xb2, "&sup2;" },
  { 0xb3, "&sup3;" },
  { 0xb4, "&acute;" },
  { 0xb5, "&micro;" },
  { 0xb6, "&para;" },
  { 0xb7, "&middot;" },
  { 0xb8, "&cedil;" },
  { 0xb9, "&sup1;" },
  { 0xba, "&ordm;" },
  { 0xbb, "&raquo;" },
  { 0xbc, "&frac14;" },
  { 0xbd, "&frac12;" },
  { 0xbe, "&frac34;" },
  { 0xbf, "&iquest;" },
  { 0xc0, "&Agrave;" },
  { 0xc1, "&Aacute;" },
  { 0xc2, "&Acirc;" },
  { 0xc3, "&Atilde;" },
  { 0xc4, "&Auml;" },
  { 0xc5, "&Aring;" },
  { 0xc6, "&AElig;" },
  { 0xc7, "&Ccedil;" },
  { 0xc8, "&Egrave;" },
  { 0xc9, "&Eacute;" },
  { 0xca, "&Ecirc;" },
  { 0xcb, "&Euml;" },
  { 0xcc, "&Igrave;" },
  { 0xcd, "&Iacute;" },
  { 0xce, "&Icirc;" },
  { 0xcf, "&Iuml;" },
  { 0xd0, "&ETH;" },
  { 0xd1, "&Ntilde;" },
  { 0xd2, "&Ograve;" },
  { 0xd3, "&Oacute;" },
  { 0xd4, "&Ocirc;" },
  { 0xd5, "&Otilde;" },
  { 0xd6, "&Ouml;" },
  { 0xd7, "&times;" },
  { 0xd8, "&Oslash;" },
  { 0xd9, "&Ugrave;" },
  { 0xda, "&Uacute;" },
  { 0xdb, "&Ucirc;" },
  { 0xdc, "&Uuml;" },
  { 0xdd, "&Yacute;" },
  { 0xde, "&THORN;" },
  { 0xdf, "&szlig;" },
  { 0xe0, "&agrave;" },
  { 0xe1, "&aacute;" },
  { 0xe2, "&acirc;" },
  { 0xe3, "&atilde;" },
  { 0xe4, "&auml;" },
  { 0xe5, "&aring;" },
  { 0xe6, "&aelig;" },
  { 0xe7, "&ccedil;" },
  { 0xe8, "&egrave;" },
  { 0xe9, "&eacute;" },
  { 0xea, "&ecirc;" },
  { 0xeb, "&euml;" },
  { 0xec, "&igrave;" },
  { 0xed, "&iacute;" },
  { 0xee, "&icirc;" },
  { 0xef, "&iuml;" },
  { 0xf0, "&eth;" },
  { 0xf1, "&ntilde;" },
  { 0xf2, "&ograve;" },
  { 0xf3, "&oacute;" },
  { 0xf4, "&ocirc;" },
  { 0xf5, "&otilde;" },
  { 0xf6, "&ouml;" },
  { 0xf7, "&divide;" },
  { 0xf8, "&oslash;" },
  { 0xf9, "&ugrave;" },
  { 0xfa, "&uacute;" },
  { 0xfb, "&ucirc;" },
  { 0xfc, "&uuml;" },
  { 0xfd, "&yacute;" },
  { 0xfe, "&thorn;" },
  { 0xff, "&yuml;" },
  // Sentinel: the null string pointer marks the end of the table.
  { 0x00, 0 }
};

#if 0
// NOTE: everything from here down to the matching "#endif  // 0" is compiled
// out.  Several of the definitions in this region are not valid C++ as
// written and would need repair before the region could be re-enabled.

// Default constructor: default-constructs the tag name and the stream member.
// No header is written.
xml_ofstream::xml_ofstream() : my_tag(), os()  {}

// Opens filename with mode m, remembers tag as the enclosing element name
// and, if the open succeeded, writes the XML header and opening tag.
// NOTE(review): the initializer list starts with a stray comma (": ,"),
// which is a syntax error; this only goes unnoticed because the code is
// inside the disabled #if 0 region.
xml_ofstream::xml_ofstream(const char *filename, const char *tag,
    std::ios_base::openmode m) : , my_tag(tag), os(filename,m)
{
  if (is_open()) {
    write_head();
  }
}

// Wraps the existing stream o, remembers tag as the enclosing element name
// and immediately writes the XML header and opening tag to o.
xml_ostream::xml_ostream(std::ostream &o, const char *tag)
  : my_tag(tag), os(o)
{
  write_head();
}

// Destructor: writes the closing tag for the enclosing element.
xml_ostream::~xml_ostream() {
  write_foot();
}

// Destructor: delegates to close(), which writes the closing tag and then
// closes the file.
// NOTE(review): close() is called unconditionally, so the closing tag is
// attempted even if the file was never opened or was already closed by an
// explicit close() -- confirm this is intended before re-enabling.
xml_ofstream::~xml_ofstream() {
  close();
}

// Sets the enclosing element name to tag, opens filename with mode m and,
// if the open succeeded, writes the XML header and opening tag.
void xml_ofstream::open(const char *filename, const char *tag,
    std::ios_base::openmode m) {
  my_tag=std::string(tag);
  os.open(filename,m);
  if (is_open()) {
    write_head();
  }
}

// Writes the closing tag and then closes the underlying file stream.
// The closing tag is written without checking whether the stream is open.
void xml_ofstream::close() {
  write_foot();
  os.close();
}

// Resets the global indentation to zero, writes xml_header followed by the
// opening "<tag>" line, then increases the indentation by two for the
// content that follows.
void xml_ostream::write_head() {
  xml_indent_level=0;
  os << xml_header << std::endl;
  os << '<' << my_tag << '>' << std::endl;
  // Called for its side effect on xml_indent_level; the returned string is
  // discarded.
  xml_indent(2);
}

// File-stream counterpart of xml_ostream::write_head(): resets the global
// indentation to zero, writes xml_header and the opening "<tag>" line, then
// increases the indentation by two.
void xml_ofstream::write_head() {
  xml_indent_level=0;
  os << xml_header << std::endl;
  os << '<' << my_tag << '>' << std::endl;
  // Called for its side effect on xml_indent_level; the returned string is
  // discarded.
  xml_indent(2);
}

// Decreases the global indentation by two and writes the closing "</tag>"
// line.
void xml_ostream::write_foot() {
  xml_indent(-2);
  os << "</" << my_tag << '>' << std::endl;
}

// File-stream counterpart of xml_ostream::write_foot(): decreases the global
// indentation by two and writes the closing "</tag>" line.
void xml_ofstream::write_foot() {
  xml_indent(-2);
  os << "</" << my_tag << '>' << std::endl;
}

// Default constructor: empty tag name and zeroed start/end positions.
// NOTE(review): the initializer list is malformed (leading comma after ':'
// and no comma between "ifs()" and "xml_end(0)"); it would not compile if
// the surrounding #if 0 were removed.
xml_ifstream::xml_ifstream() : , my_tag(""), xml_start(0), ifs()
  xml_end(0) {}

// Opens filename with mode m through the std::ifstream base, remembers tag
// as the element of interest and, if the open succeeded, positions the
// stream via seek_head().
xml_ifstream::xml_ifstream(const char *filename, const char *tag,
    std::ios_base::openmode m) : std::ifstream(filename,m), my_tag(tag),
    xml_start(0), xml_end(0) {
  if (is_open()) {
    seek_head();
  }
}

// Wraps the existing stream i and remembers tag as the element of interest.
// Unlike the xml_ifstream constructor, this does not call seek_head().
xml_istream::xml_istream(std::istream &i, const char *tag)
  : my_tag(tag), is(i) {
}

// Destructor: closes the file.
xml_ifstream::~xml_ifstream() {
  close();
}

// Sets the element of interest to tag, opens filename with mode m through
// the std::ifstream base and, if the open succeeded, positions the stream
// via seek_head().
void xml_ifstream::open(const char *filename, const char *tag,
    std::ios_base::openmode m) {
  my_tag=std::string(tag);
  std::ifstream::open(filename,m);
  if (is_open()) {
    seek_head();
  }
}

// Advances the wrapped stream to just past the '>' that ends the opening tag
// of interest.
//  - If my_tag is set, scans character by character for "<" followed by the
//    tag name.
//  - If my_tag is empty, adopts the alphanumeric name following the first
//    '<' that has one as my_tag.
// If no opening tag is found the stream is simply read to its end.
void xml_istream::seek_head() {
  // NOTE(review): tmp is never used.
  std::string tmp;
  char c;
  unsigned int i=0;
  bool start_found=false;
  if (my_tag.size()) {
    while (is) {
        is.get(c);
	if (c=='<') {
	  // Read characters for as long as they match my_tag.
	  // NOTE(review): i is incremented for the mismatching character too
	  // and is never reset for the next '<', so after a complete match i
	  // is my_tag.size()+1 and the comparison below looks off by one --
	  // confirm before re-enabling this code.
	  do {
	    is.get(c);
	    i++;
	  } while (c == my_tag[i-1]);
	  if ((i==my_tag.size()) && !isalnum(c)) {
	    start_found=true;
	    break;
	  }
	}
    }
  } else {
    while (is) {
      is.get(c);
      if (c=='<') {
	// Collect the alphanumeric run after '<' as the tag name.
	do {
	  is.get(c);
          if (isalnum(c)) my_tag+=c;
	} while (isalnum(c));
      }
      if (my_tag.size()) {
	start_found=true;
	break;
      }
    }
  }
  // Skip the remainder of the opening tag (attributes etc.) up to its '>'.
  if (start_found) {
    while ((c != '>') && is) is.get(c);
  }
}


// Locates the element of interest in the file and leaves the read position
// at its start.  On the first call (xml_start still zero) it rewinds the
// file, searches whitespace-separated tokens for the opening tag to set
// xml_start, then searches for the matching closing tag to set xml_end.
// Later calls only seek back to xml_start.
void xml_ifstream::seek_head() {
  if (!xml_start) {
    std::string tmp;
    std::string::size_type tag_start, tag_end;
    bool start_found=false;
    std::ifstream::seekg(0,std::ios::beg);
    if (my_tag.size()) {
      do {
        *this >> tmp;
        if ((tag_start=tmp.find(std::string("<")+my_tag)) != std::string::npos) {
	  // The token contains "<my_tag": step the stream back relative to
	  // the end of the token just read.
	  // NOTE(review): tag_start is npos when the token has no '>' (e.g.
	  // the tag has attributes); the offset arithmetic below does not
	  // guard against that -- confirm.
	  tag_start=tmp.find('>');
	  std::ifstream::seekg(tag_start-tmp.size()+my_tag.size()+2,std::ios::cur);
	  start_found=true;
        } else {
          // Token holds some other tag: append its alphanumeric name to
          // my_tag and treat it as the start.
          if ((tag_start=tmp.find("<")) != std::string::npos) {
	    if (isalpha(tmp[tag_start+1])) {
              while (isalnum(tmp[++tag_start])) my_tag+=tmp[tag_start];
	      start_found=true;
	      tag_start=tmp.find(">",tag_start-1);
	      std::ifstream::seekg(tag_start-tmp.size(),std::ios::cur);
	    }
          }
        }
      } while (!start_found && !std::ifstream::eof());
      xml_start=std::ifstream::tellg();
    }
    if (my_tag.size()) {
      // Count nested openings of the same tag so that the matching closing
      // tag, not the first one, ends the search.  Each token is counted at
      // most once per kind.
      int nstarts=1;
      std::string start_tag(std::string("<")+my_tag);
      std::string end_tag(std::string("</")+my_tag);
      do {
	*this >> tmp;
	if (tmp.find(start_tag)!=std::string::npos) {
	  nstarts++;
	}
	if ((tag_end=tmp.find(end_tag))!=std::string::npos) {
	  nstarts--;
	}
      } while (nstarts && !std::ifstream::eof());
      // NOTE(review): if the loop ended at end-of-file without seeing the
      // closing tag, tag_end is npos here and the relative seek below uses
      // a meaningless offset -- confirm.
      std::ifstream::seekg(tag_end-tmp.size(),std::ios::cur);
      xml_end=std::ifstream::tellg();
    }
  }
  if (xml_start) std::ifstream::seekg(xml_start,std::ios::beg);
}

// Seeks to position p measured from the start of the element (xml_start).
// Does nothing when xml_start is zero.
xml_ifstream &xml_ifstream::seekg(pos_type p) {
  if (xml_start) std::ifstream::seekg(xml_start+p);
  return *this;
}

// Relative seek within the element: offsets from std::ios::beg are measured
// from xml_start, offsets from std::ios::end from xml_end, and any other
// direction is forwarded unchanged to std::ifstream::seekg().
xml_ifstream &xml_ifstream::seekg(off_type o, std::ios::seekdir d) {
  switch (d) {
    case std::ios::beg:
      seekg(o);
      break;
    case std::ios::end:
      std::ifstream::seekg(xml_end+o);
      break;
    default:
      std::ifstream::seekg(o,d);
      break;
  }
  return *this;
}

// Returns the current read position relative to xml_start.
std::ios::pos_type xml_ifstream::tellg() {
  return std::ifstream::tellg()-xml_start;
}

// Reports end-of-data once the read position has reached xml_end.
// NOTE(review): the else branch returns the file position converted to bool
// (true for any non-zero position) rather than the underlying stream's
// end-of-file state; this looks like a typo for std::ifstream::eof() --
// confirm before re-enabling.
bool xml_ifstream::eof() {
  if (std::ifstream::tellg() >= xml_end) {
    return true;
  } else {
    return std::ifstream::tellg();
  }
}
#endif  // 0

#ifdef HAVE_MAP
#include <map>

// Lookup caches built lazily from xml_trans (only when HAVE_MAP is defined).
// encode_map: character -> entity text.  A multimap because xml_trans lists
// some characters under several entity names.  It stores raw pointers, so
// every string inserted must outlive the map.
std::multimap<unsigned char,const char *> encode_map;
// decode_map: entity text (e.g. "&amp;") -> character.
std::map<std::string, unsigned char> decode_map;

// Copies every (character, entity) pair of xml_trans into encode_map.
// Iteration stops at the table's terminating entry (null string pointer).
void populate_encode_map() {
  int idx=0;
  for (;;) {
    encode_map.insert(std::make_pair(xml_trans[idx].c,xml_trans[idx].s));
    ++idx;
    if (!xml_trans[idx].s) break;
  }
}

// Fills decode_map with an entity -> character mapping for every entry of
// xml_trans.  Iteration stops at the table's terminating entry (null string
// pointer).
void populate_decode_map() {
  int idx=0;
  for (;;) {
    decode_map[xml_trans[idx].s]=xml_trans[idx].c;
    ++idx;
    if (!xml_trans[idx].s) break;
  }
}
#endif

// 64-character alphabet: 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.
// isencchar() accepts exactly these characters plus '='.
// Presumably the base64 digit set; it is not referenced elsewhere in this
// file -- verify against the encoder that uses it.
const char * encode_arr="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// 85-character alphabet: '0'-'9', 'A'-'Z', 'a'-'y', 22 punctuation
// characters, then 'z' and '_'.  isencchar85() accepts exactly this set.
// Presumably the digit set of a base-85 encoding; it is not referenced
// elsewhere in this file -- verify against the encoder that uses it.
const char * encode_arr85=
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxy!#$()*+,-./:;=?@^`{|}~z_";


// Returns true if c is an ASCII letter, an ASCII digit, or one of '+', '/'
// and '=' (the characters of encode_arr plus '=').
bool isencchar(char c) {
  if ((c>='A') && (c<='Z')) return true;
  if ((c>='a') && (c<='z')) return true;
  if ((c>='0') && (c<='9')) return true;
  switch (c) {
    case '+':
    case '/':
    case '=':
      return true;
    default:
      return false;
  }
}

// Returns true if c belongs to the 85-character alphabet of encode_arr85:
// ASCII letters, ASCII digits, and the punctuation listed below.
bool isencchar85(char c) {
  if ((c>='0') && (c<='9')) return true;
  if ((c>='A') && (c<='Z')) return true;
  if ((c>='a') && (c<='z')) return true;
  static const char punctuation[]="!#$()*+,-./:;=?@^`{|}~_";
  // strchr() also matches the terminating NUL, so exclude it explicitly.
  return (c!='\0') && (std::strchr(punctuation,c)!=0);
}


// Returns the XML entity text for character c.
//
// If xml_trans has an entry for c, the first entity listed for it is
// returned (with HAVE_MAP the lookup goes through encode_map, which is
// populated on first use).  Otherwise a decimal numeric reference of the
// form "&#ddd;" (at least three digits) is produced.
std::string encode_char(unsigned char c) {
#ifdef HAVE_MAP
  if (encode_map.empty()) populate_encode_map();
  // lower_bound() yields the first entry stored for c, so characters with
  // several entity names encode the same way as in the linear-search build;
  // find() is allowed to return any of the duplicates.
  std::multimap<unsigned char,const char *>::iterator p=encode_map.lower_bound(c);
  if ((p!=encode_map.end()) && (p->first==c)) {
    return std::string(p->second);
  }
  // encode_map stores only a pointer, so the text cached for a numeric
  // reference must outlive this call.  One static slot per character value
  // provides that; a stack buffer here would leave a dangling pointer in
  // the map.
  static char numeric_refs[256][8];
  char *buf=numeric_refs[c];
#else
  for (int i=0;xml_trans[i].s;i++) {
    if (xml_trans[i].c == c) return std::string(xml_trans[i].s);
  }
  char buf[8];
#endif
  // c is at most 255, so the result is at most "&#255;" (6 chars + NUL) and
  // always fits in 8 bytes.
  sprintf(buf,"&#%.3d;",static_cast<int>(c));
#ifdef HAVE_MAP
  encode_map.insert(std::make_pair(c,static_cast<const char *>(buf)));
#endif
  return std::string(buf);
}

// Decodes the XML entity that starts at s and returns its character.
//
// s points at the '&' of an entity; characters are read up to (not
// including) the first ';' or the end of the string.  Named entities are
// looked up in xml_trans (through decode_map when HAVE_MAP is defined).
// Numeric references are accepted in decimal ("&#65;") and hexadecimal
// ("&#x41;") form; only the low 8 bits of the value are returned.
// Anything else is reported on stderr and decoded as '&'.
unsigned char decode_char(const char *s) {
  char code[32];
  int len=0;
  // Copy at most 30 characters so that the ';' and the terminating NUL
  // appended below always fit inside code[].
  while (*s && (*s != ';') && (len<30)) {
    code[len++]=*s++;
  }
  code[len]=';';
  code[len+1]=0;
#ifdef HAVE_MAP
  if (decode_map.empty()) populate_decode_map();
  std::map<std::string,unsigned char>::iterator p=decode_map.find(code);
  if (p!=decode_map.end()) return p->second;
#else
  // Search the whole table from its first entry.
  for (int i=0;xml_trans[i].s;i++) {
    if (!strcmp(xml_trans[i].s,code)) return xml_trans[i].c;
  }
#endif
  // code[1] is always readable: code[] holds at least ";" plus the NUL.
  if (code[1]=='#') {
    bool parsed=false;
    unsigned char decoded=0;
    if ((code[2]=='x') || (code[2]=='X')) {
      unsigned int hex=0;
      parsed=(sscanf(code+3,"%x",&hex)==1);
      decoded=static_cast<unsigned char>(hex&0xffu);
    } else {
      int value=0;
      parsed=(sscanf(code+2,"%d",&value)==1);
      decoded=static_cast<unsigned char>(value&0xff);
    }
    if (parsed) {
#ifdef HAVE_MAP
      // Remember the result so the next occurrence is a plain map lookup.
      decode_map.insert(std::make_pair(std::string(code),decoded));
#endif
      return decoded;
    }
    // A '#' without a parsable number falls through to the error path.
  }
  fprintf(stderr,"Unknown XML entity \"%s\"\n",code);
  return '&';
}

// Encodes the nelements bytes at bin as comma-separated decimal values.
//
// The result starts with a newline, is indented two columns deeper than the
// current xml_indent_level, is wrapped onto a new indented line once a line
// grows past roughly 73 characters, and ends with a newline followed by the
// caller's original indentation.  xml_indent_level is raised by two while
// encoding and lowered again before returning.
// With nelements == 0 only that framing is produced (no values).
std::string x_csv_encode_char(const unsigned char *bin, size_t nelements) {
  std::ostringstream rv("");
  long lastlen=0;
  rv << '\n' << xml_indent(2);
  for (size_t i=0;i<nelements;i++) {
    rv << static_cast<unsigned int>(bin[i]);
    // The last value is not followed by a comma or a line-wrap check.
    if (i+1==nelements) break;
    rv << ',';
    // tellp() gives the number of characters written so far without copying
    // the whole buffer the way rv.str().size() would on every element.
    const long written=static_cast<long>(std::streamoff(rv.tellp()));
    if ((written-lastlen-std::min(xml_indent_level,XML_MAX_INDENT))>73) {
      rv << '\n' << xml_indent();
      lastlen=static_cast<long>(std::streamoff(rv.tellp()));
    }
  }
  rv << '\n' << xml_indent(-2);
  return rv.str();
}

// Returns true if c ends a tag name: space, line feed, carriage return,
// comma, '<', '>' or the NUL string terminator.
bool isxmldelim(char c) {
  switch (c) {
    case ' ':
    case '\n':
    case '\r':
    case ',':
    case '<':
    case '>':
    case '\0':
      return true;
    default:
      return false;
  }
}

// Shared search behind xml_match_tag() and xml_find_tag().
//
// Builds the pattern "<name" from tag (a leading '<' is added if tag does
// not already start with one, and trailing delimiters such as '>' or spaces
// are removed), then returns a pointer to the first place in buf where that
// pattern is immediately followed by a delimiter, so "<foo" does not match
// "<foobar".  Returns a null pointer if there is no such place, if either
// argument is null, or if nothing is left of tag after trimming.
static const char *xml_locate_tag(const char *buf, const char *tag) {
  if (!buf || !tag) return 0;
  std::string pattern;
  if (tag[0] != '<') pattern+='<';
  pattern+=tag;
  // Trim trailing delimiters, stopping at the start of the string: a tag
  // such as "", "<" or ">" trims down to nothing.
  while (!pattern.empty() && isxmldelim(pattern[pattern.size()-1])) {
    pattern.erase(pattern.size()-1);
  }
  if (pattern.empty()) return 0;
  const char *hit=strstr(buf,pattern.c_str());
  // hit[pattern.size()] is at worst the NUL terminator of buf, which counts
  // as a delimiter.
  while (hit && !isxmldelim(hit[pattern.size()])) {
    hit=strstr(hit+1,pattern.c_str());
  }
  return hit;
}

// return true if the tag appears in the line
//
// tag may be given as "name", "<name" or "<name>".
bool xml_match_tag(const char* buf, const char* tag) {
    return xml_locate_tag(buf,tag)!=0;
}

// std::string convenience overload of xml_match_tag().
bool xml_match_tag(const std::string &s, const char* tag) {
  return xml_match_tag(s.c_str(),tag);
}

// Returns the offset in buf of the '<' that starts the first occurrence of
// tag, or strlen(buf) if the tag does not occur.  A null buf yields 0.
size_t xml_find_tag(const char* buf, const char* tag) {
    const char *hit=xml_locate_tag(buf,tag);
    if (hit) return static_cast<size_t>(hit-buf);
    return buf ? strlen(buf) : 0;
}

// Returns the offset in s of the '<' that starts the first occurrence of
// tag, or std::string::npos if the tag does not occur.
std::string::size_type xml_find_tag(const std::string &s, const char* tag) {
  const char *start=s.c_str();
  const char *hit=xml_locate_tag(start,tag);
  return hit ? static_cast<std::string::size_type>(hit-start)
             : std::string::npos;
}

// Extracts the first <tag>...</tag element from field into record.
//
// Returns false, leaving record untouched, if either the opening tag or a
// closing tag after it cannot be found.  The closing tag searched for is the
// first one following the opening tag, so nested elements with the same
// name are not matched up.
//
// NOTE(review): the extracted text ends right after "</tag" when tag is a
// bare name, i.e. the closing '>' is not part of the record.  That length
// calculation is kept as is because callers may depend on it -- confirm
// whether the '>' should be included.
bool extract_xml_record(const std::string &field, const char *tag, std::string &record) {
    // Built as a std::string so that a long tag cannot overflow a fixed
    // buffer.
    const std::string end_tag=std::string("/")+tag;

    // find the start_tag
    const std::string::size_type j=xml_find_tag(field,tag);
    if (j==std::string::npos) return false;
    // find the end tag; k is relative to the start tag at j
    const std::string::size_type k=xml_find_tag(field.substr(j),end_tag.c_str());
    if (k==std::string::npos) return false;

    // k characters up to the '<' of the end tag, the '<' itself, then the
    // characters of end_tag.
    record=std::string(field,j,k+end_tag.size()+1);
    return true;
}

//
// $Log$
// Revision 1.29  2004/04/05 20:09:41  korpela
// Rewrote extract_xml_record() to solve some problems...
//
// Revision 1.28  2004/03/06 09:45:25  rwalton
// *** empty log message ***
//
// Revision 1.27  2004/01/22 17:57:41  davea
// *** empty log message ***
//
// Revision 1.26  2004/01/20 02:51:50  korpela
// VC 7 mods
//
// Revision 1.25  2003/12/01 23:42:05  korpela
// Under some compilers template parameters of type char [] weren't getting
// cast to char *.  Template functions now use &(array[0]) to ensure correct
// type is used.
//
// Revision 1.24  2003/11/11 17:29:01  quarl
// *** empty log message ***
//
// Revision 1.23  2003/10/29 20:08:49  korpela
// *** empty log message ***
//
// Revision 1.22  2003/10/27 23:07:34  korpela
// *** empty log message ***
//
// Revision 1.21  2003/10/27 20:07:11  korpela
// *** empty log message ***
//
// Revision 1.20  2003/10/27 19:41:23  korpela
//
// Fixed potential buffer overrun in decode_char()
//
// Revision 1.19  2003/10/27 17:52:49  korpela
// *** empty log message ***
//
// Revision 1.18  2003/10/24 16:58:10  korpela
// *** empty log message ***
//
// Revision 1.17  2003/10/24 00:05:02  davea
// *** empty log message ***
//
// Revision 1.16  2003/10/23 19:58:20  jeffc
// jeffc - bug fix in csv encode routine
//
// Revision 1.15  2003/10/23 19:18:38  jeffc
// jeffc - put back in line feeds - no longer using parese_str().
//
// Revision 1.14  2003/10/23 15:39:54  korpela
// no message
//
// Revision 1.13  2003/10/23 00:25:15  jeffc
// jeffc - no line feeds in CSV encoding
//
// Revision 1.12  2003/10/22 18:23:23  korpela
// *** empty log message ***
//
// Revision 1.11  2003/10/22 18:01:41  korpela
// *** empty log message ***
//
// Revision 1.10  2003/10/22 15:24:10  korpela
// *** empty log message ***
//
// Revision 1.9  2003/10/21 18:14:36  korpela
// *** empty log message ***
//
//