// $Id$ // The contents of this file are subject to the BOINC Public License // Version 1.0 (the "License"); you may not use this file except in // compliance with the License. You may obtain a copy of the License at // http://boinc.berkeley.edu/license_1.0.txt // // Software distributed under the License is distributed on an "AS IS" // basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the // License for the specific language governing rights and limitations // under the License. // // The Original Code is the Berkeley Open Infrastructure for Network Computing. // // The Initial Developer of the Original Code is the SETI@home project. // Portions created by the SETI@home project are Copyright (C) 2002 // University of California at Berkeley. All Rights Reserved. // // Contributor(s): // // Revision History // $Log$ // Revision 1.29 2004/04/05 20:09:41 korpela // Rewrote extract_xml_record() to solve some problems... // // Revision 1.28 2004/03/06 09:45:25 rwalton // *** empty log message *** // // Revision 1.27 2004/01/22 17:57:41 davea // *** empty log message *** // // Revision 1.26 2004/01/20 02:51:50 korpela // VC 7 mods // // Revision 1.25 2003/12/01 23:42:05 korpela // Under some compilers template parameters of type char [] weren't getting // cast to char *. Template functions now use &(array[0]) to ensure correct // type is used. // // #include "config.h" #include #include #include #include #include #include #include "std_fixes.h" #include "xml_util.h" int xml_indent_level=0; std::string xml_indent(int i) { if (i) xml_indent_level+=i; xml_indent_level = (xml_indent_level>0) ? xml_indent_level : 0; return std::string(std::min(xml_indent_level,XML_MAX_INDENT),' '); } // Most of these entries are for reverse translation of poorly written HTML. // Forward translation doesn't translate most printable characters. const xml_entity xml_trans[]= { { 0x07, "&bel;" }, { 0x0a, "&lf;" }, { 0x0d, "&cr;" }, { ' ', "&sp;" }, { '!', "!" }, { '\"', """ }, { '\"', "&dquot;" }, { '#', "#" }, { '$', "$" }, { '%', "%" }, { '&', "&" }, { '\'', "'" }, { '(', "(" }, { ')', ")" }, { '*', "*" }, { '+', "+" }, { ',', "," }, { '-', "‐" }, { '-', "−" }, { '.', "." }, { '/', "/" }, { ':', ":" }, { ';', ";" }, { '<', "<" }, { '=', "=" }, { '>', ">" }, { '?', "?" }, { '@', "@" }, { '[', "[" }, { '\\', "\" }, { ']', "]" }, { '^', "ˆ" }, { '_', "_" }, { '_', "―" }, { '`', "`" }, { '{', "{" }, { '|', "|" }, { '}', "}" }, { '~', "˜" }, { 0x82, "‚" }, { 0x84, "„" }, { 0x85, "&ldots;" }, { 0x8a, "Š" }, { 0x8b, "‹" }, { 0x8c, "Œ" }, { 0x91, "‘" }, { 0x91, "’" }, { 0x92, "’" }, { 0x93, "“" }, { 0x93, "”" }, { 0x94, "”" }, { 0x95, "•" }, { 0x96, "–" }, { 0x96, "&endash;" }, { 0x97, "—" }, { 0x97, "&emdash;" }, { 0xa0, " " }, { 0xa1, "¡" }, { 0xa2, "¢" }, { 0xa3, "£" }, { 0xa4, "¤" }, { 0xa5, "¥" }, { 0xa6, "¦" }, { 0xa7, "§" }, { 0xa8, "¨" }, { 0xa9, "©" }, { 0xaa, "ª" }, { 0xab, "«" }, { 0xac, "¬" }, { 0xad, "­" }, { 0xae, "®" }, { 0xaf, "¯" }, { 0xb0, "°" }, { 0xb1, "±" }, { 0xb2, "²" }, { 0xb3, "³" }, { 0xb4, "´" }, { 0xb5, "µ" }, { 0xb6, "¶" }, { 0xb7, "·" }, { 0xb8, "¸" }, { 0xb9, "¹" }, { 0xba, "º" }, { 0xbb, "»" }, { 0xbc, "¼" }, { 0xbd, "½" }, { 0xbe, "¾" }, { 0xbf, "¿" }, { 0xc0, "À" }, { 0xc1, "Á" }, { 0xc2, "Â" }, { 0xc3, "Ã" }, { 0xc4, "Ä" }, { 0xc5, "Å" }, { 0xc6, "Æ" }, { 0xc7, "Ç" }, { 0xc8, "È" }, { 0xc9, "É" }, { 0xca, "Ê" }, { 0xcb, "Ë" }, { 0xcc, "Ì" }, { 0xcd, "Í" }, { 0xce, "Î" }, { 0xcf, "Ï" }, { 0xd0, "Ð" }, { 0xd1, "Ñ" }, { 0xd2, "Ò" }, { 0xd3, "Ó" }, { 0xd4, "Ô" }, { 0xd5, "Õ" }, { 0xd6, "Ö" }, { 0xd7, "×" }, { 0xd8, "Ø" }, { 0xd9, "Ù" }, { 0xda, "Ú" }, { 0xdb, "Û" }, { 0xdc, "Ü" }, { 0xdd, "Ý" }, { 0xde, "Þ" }, { 0xdf, "ß" }, { 0xe0, "à" }, { 0xe1, "á" }, { 0xe2, "â" }, { 0xe3, "ã" }, { 0xe4, "ä" }, { 0xe5, "å" }, { 0xe6, "æ" }, { 0xe7, "ç" }, { 0xe8, "è" }, { 0xe9, "é" }, { 0xea, "ê" }, { 0xeb, "ë" }, { 0xec, "ì" }, { 0xed, "í" }, { 0xee, "î" }, { 0xef, "ï" }, { 0xf0, "ð" }, { 0xf1, "ñ" }, { 0xf2, "ò" }, { 0xf3, "ó" }, { 0xf4, "ô" }, { 0xf5, "õ" }, { 0xf6, "ö" }, { 0xf7, "÷" }, { 0xf8, "ø" }, { 0xf9, "ù" }, { 0xfa, "ú" }, { 0xfb, "û" }, { 0xfc, "ü" }, { 0xfd, "ý" }, { 0xfe, "þ" }, { 0xff, "ÿ" }, { 0x00, 0 } }; #if 0 xml_ofstream::xml_ofstream() : my_tag(), os() {} xml_ofstream::xml_ofstream(const char *filename, const char *tag, std::ios_base::openmode m) : , my_tag(tag), os(filename,m) { if (is_open()) { write_head(); } } xml_ostream::xml_ostream(std::ostream &o, const char *tag) : my_tag(tag), os(o) { write_head(); } xml_ostream::~xml_ostream() { write_foot(); } xml_ofstream::~xml_ofstream() { close(); } void xml_ofstream::open(const char *filename, const char *tag, std::ios_base::openmode m) { my_tag=std::string(tag); os.open(filename,m); if (is_open()) { write_head(); } } void xml_ofstream::close() { write_foot(); os.close(); } void xml_ostream::write_head() { xml_indent_level=0; os << xml_header << std::endl; os << '<' << my_tag << '>' << std::endl; xml_indent(2); } void xml_ofstream::write_head() { xml_indent_level=0; os << xml_header << std::endl; os << '<' << my_tag << '>' << std::endl; xml_indent(2); } void xml_ostream::write_foot() { xml_indent(-2); os << "' << std::endl; } void xml_ofstream::write_foot() { xml_indent(-2); os << "' << std::endl; } xml_ifstream::xml_ifstream() : , my_tag(""), xml_start(0), ifs() xml_end(0) {} xml_ifstream::xml_ifstream(const char *filename, const char *tag, std::ios_base::openmode m) : std::ifstream(filename,m), my_tag(tag), xml_start(0), xml_end(0) { if (is_open()) { seek_head(); } } xml_istream::xml_istream(std::istream &i, const char *tag) : my_tag(tag), is(i) { } xml_ifstream::~xml_ifstream() { close(); } void xml_ifstream::open(const char *filename, const char *tag, std::ios_base::openmode m) { my_tag=std::string(tag); std::ifstream::open(filename,m); if (is_open()) { seek_head(); } } void xml_istream::seek_head() { std::string tmp; char c; unsigned int i=0; bool start_found=false; if (my_tag.size()) { while (is) { is.get(c); if (c=='<') { do { is.get(c); i++; } while (c == my_tag[i-1]); if ((i==my_tag.size()) && !isalnum(c)) { start_found=true; break; } } } } else { while (is) { is.get(c); if (c=='<') { do { is.get(c); if (isalnum(c)) my_tag+=c; } while (isalnum(c)); } if (my_tag.size()) { start_found=true; break; } } } if (start_found) { while ((c != '>') && is) is.get(c); } } void xml_ifstream::seek_head() { if (!xml_start) { std::string tmp; std::string::size_type tag_start, tag_end; bool start_found=false; std::ifstream::seekg(0,std::ios::beg); if (my_tag.size()) { do { *this >> tmp; if ((tag_start=tmp.find(std::string("<")+my_tag)) != std::string::npos) { tag_start=tmp.find('>'); std::ifstream::seekg(tag_start-tmp.size()+my_tag.size()+2,std::ios::cur); start_found=true; } else { if ((tag_start=tmp.find("<")) != std::string::npos) { if (isalpha(tmp[tag_start+1])) { while (isalnum(tmp[++tag_start])) my_tag+=tmp[tag_start]; start_found=true; tag_start=tmp.find(">",tag_start-1); std::ifstream::seekg(tag_start-tmp.size(),std::ios::cur); } } } } while (!start_found && !std::ifstream::eof()); xml_start=std::ifstream::tellg(); } if (my_tag.size()) { int nstarts=1; std::string start_tag(std::string("<")+my_tag); std::string end_tag(std::string("> tmp; if (tmp.find(start_tag)!=std::string::npos) { nstarts++; } if ((tag_end=tmp.find(end_tag))!=std::string::npos) { nstarts--; } } while (nstarts && !std::ifstream::eof()); std::ifstream::seekg(tag_end-tmp.size(),std::ios::cur); xml_end=std::ifstream::tellg(); } } if (xml_start) std::ifstream::seekg(xml_start,std::ios::beg); } xml_ifstream &xml_ifstream::seekg(pos_type p) { if (xml_start) std::ifstream::seekg(xml_start+p); return *this; } xml_ifstream &xml_ifstream::seekg(off_type o, std::ios::seekdir d) { switch (d) { case std::ios::beg: seekg(o); break; case std::ios::end: std::ifstream::seekg(xml_end+o); break; default: std::ifstream::seekg(o,d); break; } return *this; } std::ios::pos_type xml_ifstream::tellg() { return std::ifstream::tellg()-xml_start; } bool xml_ifstream::eof() { if (std::ifstream::tellg() >= xml_end) { return true; } else { return std::ifstream::tellg(); } } #endif // 0 #ifdef HAVE_MAP #include std::multimap encode_map; std::map decode_map; void populate_encode_map() { int i=0; do { encode_map.insert(std::make_pair(xml_trans[i].c,xml_trans[i].s)); } while (xml_trans[++i].s); } void populate_decode_map() { int i=0; do { decode_map[xml_trans[i].s]=xml_trans[i].c; } while (xml_trans[++i].s); } #endif const char * encode_arr="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; const char * encode_arr85= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxy!#$()*+,-./:;=?@^`{|}~z_"; bool isencchar(char c) { bool rv=((c>='A') && (c<='Z')); rv|=((c>='a') && (c<='z')); rv|=((c>='0') && (c<='9')); rv|=((c=='+') || (c=='/') || (c=='=')); return rv; } bool isencchar85(char c) { bool rv=((c>='A') && (c<='Z')); rv|=((c>='a') && (c<='z')); rv|=((c>='0') && (c<='9')); switch (c) { case '!': case '#': case '$': case '(': case ')': case '*': case '+': case ',': case '-': case '.': case '/': case ':': case ';': case '=': case '?': case '@': case '^': case '`': case '{': case '|': case '}': case '~': case '_': rv=true; break; default: break; } return rv; } std::string encode_char(unsigned char c) { #ifdef HAVE_MAP if (!(encode_map.size())) populate_encode_map(); std::multimap::iterator p=encode_map.find(c); if (p!=encode_map.end()) { return (p->second); } else { #else int i=0; while (xml_trans[i].s) { if (xml_trans[i].c == c) return std::string(xml_trans[i].s); i++; } { #endif char buf[16]; sprintf(buf,"&#%.3d;",static_cast(c)); #ifdef HAVE_MAP encode_map.insert(std::make_pair(c,&(buf[0]))); #endif return std::string(buf); } } unsigned char decode_char(const char *s) { char code[32]; int i=0; code[31]=0; while (*s && (*s != ';') && i<31) { code[i]=*s; s++; i++; } code[i]=';'; code[i+1]=0; #ifdef HAVE_MAP if (!(decode_map.size())) populate_decode_map(); std::map::iterator p=decode_map.find(code); if (p!=decode_map.end()) { return (p->second); } else { #else while (xml_trans[i].s) { if (!strcmp(xml_trans[i].s,(const char *)(&code[0]))) return xml_trans[i].c; i++; } { #endif if (code[1]=='#') { sscanf((const char *)(code+2),"%d",&i); #ifdef HAVE_MAP decode_map.insert(std::make_pair(std::string(code),static_cast(i&0xff))); #endif } else { fprintf(stderr,"Unknown XML entity \"%s\"\n",code); i='&'; } return static_cast(i&0xff); } } std::string x_csv_encode_char(const unsigned char *bin, size_t nelements) { std::ostringstream rv(""); long lastlen=0; size_t i; rv << std::endl << xml_indent(2); for (i=0;i<(nelements-1);i++) { unsigned int ival=bin[i]; rv << ival << ','; if ((static_cast(rv.str().size())-lastlen-std::min(xml_indent_level,XML_MAX_INDENT))>73) { rv << std::endl << xml_indent(); lastlen=(long)rv.str().size(); } } unsigned int ival=bin[i]; rv << ival << std::endl << xml_indent(-2); return rv.str(); } // test if a character is an xml tag delimiter bool isxmldelim(char c) { return ((c==' ') || (c=='\n') || (c=='\r') || (c==',') || (c=='<') || (c=='>') || (c==0)); } // return true if the tag appears in the line // bool xml_match_tag(const char* buf, const char* tag) { char tmp_tag[BUFSIZ]={'<',0}; if (tag[0] == '<') { strlcpy(tmp_tag,tag,BUFSIZ); } else { strlcat(tmp_tag,tag,BUFSIZ); } char *p=tmp_tag+strlen(tmp_tag); do { *(p--)=0; } while (isxmldelim(*p)); while ((buf=strstr(buf,tmp_tag))) { if (isxmldelim(buf[strlen(tmp_tag)])) return true; buf++; } return false; } bool xml_match_tag(const std::string &s, const char* tag) { return xml_match_tag(s.c_str(),tag); } size_t xml_find_tag(const char* buf, const char* tag) { const char *buf0=buf; char tmp_tag[BUFSIZ]={'<',0}; if (tag[0] == '<') { strlcpy(tmp_tag,tag,BUFSIZ); } else { strlcat(tmp_tag,tag,BUFSIZ); } char *p=tmp_tag+strlen(tmp_tag); do { *(p--)=0; } while (isxmldelim(*p)); while ((buf=strstr(buf,tmp_tag))) { if (isxmldelim(buf[strlen(tmp_tag)])) return buf-buf0; buf++; } return strlen(buf0); } std::string::size_type xml_find_tag(const std::string &s, const char* tag) { std::string::size_type p=xml_find_tag(s.c_str(),tag); return (p!=strlen(s.c_str()))?p:(std::string::npos); } bool extract_xml_record(const std::string &field, const char *tag, std::string &record) { char end_tag[256]; sprintf(end_tag,"/%s",tag); std::string::size_type j,k; // find the start_tag j=xml_find_tag(field,tag); if (j==std::string::npos) return false; // find the end tag k=xml_find_tag(std::string(field,j,field.length()-j),end_tag); if (k==std::string::npos) return false; record=std::string(field,j,k+strlen(end_tag)+1); return true; } // // $Log$ // Revision 1.29 2004/04/05 20:09:41 korpela // Rewrote extract_xml_record() to solve some problems... // // Revision 1.28 2004/03/06 09:45:25 rwalton // *** empty log message *** // // Revision 1.27 2004/01/22 17:57:41 davea // *** empty log message *** // // Revision 1.26 2004/01/20 02:51:50 korpela // VC 7 mods // // Revision 1.25 2003/12/01 23:42:05 korpela // Under some compilers template parameters of type char [] weren't getting // cast to char *. Template functions now use &(array[0]) to ensure correct // type is used. // // Revision 1.24 2003/11/11 17:29:01 quarl // *** empty log message *** // // Revision 1.23 2003/10/29 20:08:49 korpela // *** empty log message *** // // Revision 1.22 2003/10/27 23:07:34 korpela // *** empty log message *** // // Revision 1.21 2003/10/27 20:07:11 korpela // *** empty log message *** // // Revision 1.20 2003/10/27 19:41:23 korpela // // Fixed potential buffer overrun in decode_char() // // Revision 1.19 2003/10/27 17:52:49 korpela // *** empty log message *** // // Revision 1.18 2003/10/24 16:58:10 korpela // *** empty log message *** // // Revision 1.17 2003/10/24 00:05:02 davea // *** empty log message *** // // Revision 1.16 2003/10/23 19:58:20 jeffc // jeffc - bug fix in csv encode routine // // Revision 1.15 2003/10/23 19:18:38 jeffc // jeffc - put back in line feeds - no longer using parese_str(). // // Revision 1.14 2003/10/23 15:39:54 korpela // no message // // Revision 1.13 2003/10/23 00:25:15 jeffc // jeffc - no line feeds in CSV encoding // // Revision 1.12 2003/10/22 18:23:23 korpela // *** empty log message *** // // Revision 1.11 2003/10/22 18:01:41 korpela // *** empty log message *** // // Revision 1.10 2003/10/22 15:24:10 korpela // *** empty log message *** // // Revision 1.9 2003/10/21 18:14:36 korpela // *** empty log message *** // //