// Berkeley Open Infrastructure for Network Computing // http://boinc.berkeley.edu // Copyright (C) 2005 University of California // // This is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; // either version 2.1 of the License, or (at your option) any later version. // // This software is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU Lesser General Public License for more details. // // To view the GNU Lesser General Public License visit // http://www.gnu.org/copyleft/lesser.html // or write to the Free Software Foundation, Inc., // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // A very crude interface for parsing XML files; // assumes all elements are either single-line or // have start and end tags on separate lines. // This is meant to be used ONLY for parsing XML files produced // by the BOINC scheduling server or client. // Could replace this with a more general parser. 
#if defined(_WIN32) && !defined(__STDWX_H__) && !defined(_BOINC_WIN_) && !defined(_AFX_STDAFX_H_) #include "boinc_win.h" #endif #ifndef _WIN32 #include "config.h" #include #include #include #include #if HAVE_IEEEFP_H #include #endif #endif #include "error_numbers.h" #include "str_util.h" #include "parse.h" #ifdef _USING_FCGI_ #include "fcgi_stdio.h" #endif using std::string; // Parse a boolean; tag is of form "foobar" // Accept either or 0|1 // bool parse_bool(const char* buf, const char* tag, bool& result) { char single_tag[256], start_tag[256]; int x; sprintf(single_tag, "<%s/>", tag); if (match_tag(buf, single_tag)) { result = true; return true; } sprintf(start_tag, "<%s>", tag); if (parse_int(buf, start_tag, x)) { result = (x != 0); return true; } return false; } // parse a string of the form ...string...; // returns the "string" part. // Does XML unescaping (replace < with <) // "string" may not include '<' // Strips white space from ends. // Use "", if there might be attributes // bool parse_str(const char* buf, const char* tag, char* dest, int destlen) { string str; const char* p; char tempbuf[1024]; int len; p = strstr(buf, tag); if (!p) return false; p = strchr(p, '>'); p++; const char* q = strchr(p, '<'); if (!q) return false; len = (int)(q-p); if (len >= destlen) len = destlen-1; memcpy(tempbuf, p, len); tempbuf[len] = 0; strip_whitespace(tempbuf); xml_unescape(tempbuf, dest); return true; } bool parse_str(const char* buf, const char* tag, string& dest) { char tempbuf[1024]; if (!parse_str(buf, tag, tempbuf, 1024)) return false; dest = tempbuf; return true; } // parse a string of the form 'xxx name="value" xxx'; // returns value in dest // void parse_attr(const char* buf, const char* name, char* dest, int len) { const char* p; const char *q; strcpy(dest, ""); p = strstr(buf, name); if (!p) return; p = strchr(p, '"'); if (!p) return; q = strchr(p+1, '"'); if (!q) return; if (len > q-p) len = (int)(q-p); strlcpy(dest, p+1, len); } int copy_stream(FILE* in, 
FILE* out) { char buf[1024]; int n, m; while (1) { n = (int)fread(buf, 1, 1024, in); m = (int)fwrite(buf, 1, n, out); if (m != n) return ERR_FWRITE; if (n < 1024) break; } return 0; } // append to a malloc'd string // int strcatdup(char*& p, char* buf) { p = (char*)realloc(p, strlen(p) + strlen(buf)+1); if (!p) { return ERR_MALLOC; } strcat(p, buf); return 0; } // copy from a file to a malloc'd string until the end tag is reached // int dup_element_contents(FILE* in, const char* end_tag, char** pp) { char buf[256]; int retval; char* p = strdup(""); while (fgets(buf, 256, in)) { if (strstr(buf, end_tag)) { *pp = p; return 0; } retval = strcatdup(p, buf); if (retval) return retval; } return ERR_XML_PARSE; } // copy from a file to static buffer // int copy_element_contents(FILE* in, const char* end_tag, char* p, int len) { char buf[256]; int n; strcpy(p, ""); while (fgets(buf, 256, in)) { if (strstr(buf, end_tag)) { return 0; } n = (int)strlen(buf); if (n >= len-1) return ERR_XML_PARSE; strcat(p, buf); len -= n; } return ERR_XML_PARSE; } int copy_element_contents(FILE* in, const char* end_tag, string& str) { char buf[256]; str = ""; while (fgets(buf, 256, in)) { if (strstr(buf, end_tag)) { return 0; } str += buf; } return ERR_XML_PARSE; } // replace XML element contents (element must be present) // void replace_element_contents( char* buf, const char* start, const char* end, const char* replacement ) { char temp[4096], *p, *q; p = strstr(buf, start); p += strlen(start); q = strstr(p, end); strlcpy(temp, q, sizeof(temp)); strcpy(p, replacement); strcat(p, temp); } // if the string contains a substring of the form X...Y, // remove the first such. bool remove_element(char* buf, const char* start, const char* end) { char* p, *q; p = strstr(buf, start); if (!p) return false; q = strstr(p+strlen(start), end); if (!q) return false; strcpy(p, q+strlen(end)); return true; } // replace a substring. Do at most one instance. 
//
// Returns true iff "substr" was found.
// "str" must have room for the result.
//
bool str_replace(char* str, const char* substr, const char* replacement) {
    char* p = strstr(str, substr);
    if (!p) return false;

    // Shift the tail to where it will end up, then copy the
    // replacement into the gap.  Done in place with memmove(), so there
    // is no fixed-size temporary to overflow.
    size_t new_len = strlen(replacement);
    char* tail = p + strlen(substr);
    memmove(p + new_len, tail, strlen(tail) + 1);
    memcpy(p, replacement, new_len);
    return true;
}

// if the given XML has an element of the form
// <venue name="venue_name">
//   ...
// </venue>
// then return the contents of that element.
// Otherwise strip out all <venue> elements
//
// "out" must be large enough to hold a copy of "in".
//
// NOTE(review): the markup literals and part of the body of this function
// were missing from this copy of the file and have been reconstructed from
// the comments and the surviving code; confirm against the upstream source.
//
void extract_venue(const char* in, const char* venue_name, char* out) {
    const char* p, *q;
    char* wp;
    char buf[256];

    int n = snprintf(buf, sizeof(buf), "<venue name=\"%s\">", venue_name);
    bool fits = (n >= 0 && n < (int)sizeof(buf));
    // a venue name too long for the buffer can't be matched reliably
    p = fits ? strstr(in, buf) : NULL;
    if (p) {
        // prefs contain the specified venue
        //
        p += strlen(buf);
        strcpy(out, p);
        wp = strstr(out, "</venue");
        if (wp) *wp = 0;
    } else {
        // prefs don't contain the specified venue;
        // copy everything that lies outside <venue> elements
        //
        q = in;
        wp = out;
        *wp = 0;
        while (1) {
            p = strstr(q, "<venue");
            if (!p) {
                strcpy(wp, q);
                break;
            }
            memcpy(wp, q, p-q);
            wp += p-q;
            *wp = 0;
            q = strstr(p, "</venue>");
            if (!q) break;
            q += strlen("</venue>");
        }
    }
}

// copy a line from the given string.
// kinda like fgets() when you're reading from a string
//
// Copies up to (not including) the next newline into buf, truncated
// to len-1 characters, and advances "in" past the newline.
// Returns NULL (and leaves "in" alone) if there is no newline.
//
char* sgets(char* buf, int len, char*& in) {
    char* p = strchr(in, '\n');
    if (!p) return NULL;
    if (len > 0) {
        // bounded copy; the input is never written to
        size_t n = (size_t)(p - in);
        if (n > (size_t)(len-1)) n = (size_t)(len-1);
        memcpy(buf, in, n);
        buf[n] = 0;
    }
    in = p+1;
    return buf;
}

// NOTE: these used to take std::string instead of char* args.
// But this performed poorly.
//
// NOTE: output buffer should be 6X size of input
//
// '<' and '&' become &lt; and &amp;.
// Bytes above 127, and tab/LF/CR, become numeric references (&#N;).
// Other control characters are dropped.
//
void xml_escape(const char* in, char* out) {
    char* p = out;

    for (; *in; in++) {
        int x = (int) *in;
        x &= 0xff;   // just in case
        if (x == '<') {
            strcpy(p, "&lt;");
            p += 4;
        } else if (x == '&') {
            strcpy(p, "&amp;");
            p += 5;
        } else if (x>127) {
            p += sprintf(p, "&#%d;", x);
        } else if (x<32) {
            switch(x) {
            case 9:
            case 10:
            case 13:
                p += sprintf(p, "&#%d;", x);
                break;
            }
        } else {
            *p++ = x;
        }
    }
    *p = 0;
}

// output buffer need not be larger than input
//
// Reverses xml_escape(): handles &lt;, &amp; and numeric references (&#N;).
// A "&#" with no terminating ';' is copied through unchanged.
//
void xml_unescape(const char* in, char* out) {
    char* p = out;
    while (*in) {
        if (!strncmp(in, "&lt;", 4)) {
            *p++ = '<';
            in += 4;
        } else if (!strncmp(in, "&amp;", 5)) {
            *p++ = '&';
            in += 5;
        } else if (!strncmp(in, "&#", 2)) {
            const char* semi = strchr(in+2, ';');
            if (semi) {
                char c = atoi(in+2);
                *p++ = c;
                in = semi+1;
            } else {
                // unterminated reference: keep the text literally
                // rather than walking off the end of the input
                *p++ = *in++;
            }
        } else {
            *p++ = *in++;
        }
    }
    *p = 0;
}

// we got an unrecognized line.
// If it has two <'s (e.g. xx) return 0. // If it's of the form return 0. // If it's of the form then scan for and return 0. // Otherwise return ERR_XML_PARSE // int skip_unrecognized(char* buf, MIOFILE& fin) { char* p, *q, buf2[256]; std::string close_tag; p = strchr(buf, '<'); if (!p) { return ERR_XML_PARSE; } if (strchr(p+1, '<')) { return 0; } q = strchr(p+1, '>'); if (!q) { return ERR_XML_PARSE; } if (q[-1] == '/') return 0; *q = 0; close_tag = string(""); while (fin.fgets(buf2, 256)) { if (strstr(buf2, close_tag.c_str())) { return 0; } } return ERR_XML_PARSE; } XML_PARSER::XML_PARSER(MIOFILE* _f) { f = _f; } // read until find non-whitespace char. // Return the char in the reference param // Return true iff reached EOF // bool XML_PARSER::scan_nonws(int& first_char) { int c; while (1) { c = f->_getc(); if (c == EOF) return true; if (isspace(c)) continue; first_char = c; return false; } } int XML_PARSER::scan_comment() { char buf[256]; char* p = buf; while (1) { int c = f->_getc(); if (c == EOF) return 2; *p++ = c; *p = 0; if (strstr(buf, "-->")) { return 1; } if (strlen(buf) > 32) { strcpy(buf, buf+16); p = buf; } } } // we just read a <; read until we find a >, // and copy intervening text to buf. 
// Return: // 0 if got a tag // 1 if got a comment (ignore) // 2 if reached EOF // TODO: parse attributes too // int XML_PARSER::scan_tag(char* buf, int len) { int c; char* buf_start = buf; for (int i=0; ; i++) { c = f->_getc(); if (c == EOF) return 2; if (c == '>') { *buf = 0; return 0; } if (--len > 0) { *buf++ = c; } // check for comment start // if (i==2 && !strncmp(buf_start, "!--", 3)) { return scan_comment(); } } } // read and copy text to buf; stop when find a <; // ungetc() that so we read it again // Return true iff reached EOF // bool XML_PARSER::copy_until_tag(char* buf, int len) { int c; while (1) { c = f->_getc(); if (c == EOF) return true; if (c == '<') { f->_ungetc(c); *buf = 0; return false; } if (--len > 0) { *buf++ = c; } } } // Scan something, either tag or text. // Strip whitespace at start and end. // Return true iff reached EOF // bool XML_PARSER::get(char* buf, int len, bool& is_tag) { bool eof; int c; while (1) { eof = scan_nonws(c); if (eof) return true; if (c == '<') { int retval = scan_tag(buf, len); if (retval == 2) return true; if (retval == 1) continue; is_tag = true; } else { buf[0] = c; eof = copy_until_tag(buf+1, len-1); if (eof) return true; is_tag = false; } strip_whitespace(buf); return false; } } // We just parsed "parsed_tag". // If it matches "start_tag", and is followed by a string // and by the matching close tag, return the string in "buf", // and return true. 
// bool XML_PARSER::parse_str( char* parsed_tag, const char* start_tag, char* buf, int len ) { bool is_tag, eof; char end_tag[256], tag[256], tmp[64000]; // handle the archaic form , which means empty string // strcpy(tag, start_tag); strcat(tag, "/"); if (!strcmp(parsed_tag, tag)) { strcpy(buf, ""); return true; } // check for start tag // if (strcmp(parsed_tag, start_tag)) return false; end_tag[0] = '/'; strcpy(end_tag+1, start_tag); // get text after start tag // eof = get(tmp, 64000, is_tag); if (eof) return false; // if it's the end tag, return empty string // if (is_tag) { if (strcmp(tmp, end_tag)) { return false; } else { strcpy(buf, ""); return true; } } eof = get(tag, sizeof(tag), is_tag); if (eof) return false; if (!is_tag) return false; if (strcmp(tag, end_tag)) return false; strlcpy(buf, tmp, len); return true; } bool XML_PARSER::parse_string( char* parsed_tag, const char* start_tag, string& str ) { char buf[8192]; bool flag = parse_str(parsed_tag, start_tag, buf, sizeof(buf)); if (!flag) return false; str = buf; return true; } // Same, for integers // bool XML_PARSER::parse_int(char* parsed_tag, const char* start_tag, int& i) { char buf[256], *end; bool is_tag, eof; char end_tag[256], tag[256]; if (strcmp(parsed_tag, start_tag)) return false; end_tag[0] = '/'; strcpy(end_tag+1, start_tag); eof = get(buf, sizeof(buf), is_tag); if (eof) return false; if (is_tag) { if (!strcmp(buf, end_tag)) { i = 0; // treat as 0 return true; } else { return false; } } i = strtol(buf, &end, 0); if (end != buf+strlen(buf)) return false; eof = get(tag, sizeof(tag), is_tag); if (eof) return false; if (!is_tag) return false; if (strcmp(tag, end_tag)) return false; return true; } // Same, for doubles // bool XML_PARSER::parse_double(char* parsed_tag, const char* start_tag, double& x) { char buf[256], *end; bool is_tag, eof; char end_tag[256], tag[256]; if (strcmp(parsed_tag, start_tag)) return false; end_tag[0] = '/'; strcpy(end_tag+1, start_tag); eof = get(buf, sizeof(buf), 
is_tag); if (eof) return false; if (is_tag) { if (!strcmp(buf, end_tag)) { x = 0; // treat as 0 return true; } else { return false; } } x = strtod(buf, &end); if (end != buf+strlen(buf)) return false; eof = get(tag, sizeof(tag), is_tag); if (eof) return false; if (!is_tag) return false; if (strcmp(tag, end_tag)) return false; return true; } // Same, for bools // bool XML_PARSER::parse_bool(char* parsed_tag, const char* start_tag, bool& b) { char buf[256], *end; bool is_tag, eof; char end_tag[256], tag[256]; // handle the archaic form , which means true // strcpy(tag, start_tag); strcat(tag, "/"); if (!strcmp(parsed_tag, tag)) { b = true; return true; } // otherwise look for something of the form int // if (strcmp(parsed_tag, start_tag)) return false; eof = get(buf, sizeof(buf), is_tag); if (eof) return false; if (is_tag) return false; b = (strtol(buf, &end, 0) != 0); if (end != buf+strlen(buf)) return false; end_tag[0] = '/'; strcpy(end_tag+1, start_tag); eof = get(tag, sizeof(tag), is_tag); if (eof) return false; if (!is_tag) return false; if (strcmp(tag, end_tag)) return false; return true; } // parse a start tag (optionally preceded by ) // bool XML_PARSER::parse_start(const char* start_tag) { char tag[256]; bool eof, is_tag; eof = get(tag, sizeof(tag), is_tag); if (eof || !is_tag ) { return false; } if (strstr(tag, "?xml")) { eof = get(tag, sizeof(tag), is_tag); if (eof || !is_tag ) { return false; } } if (strcmp(tag, start_tag)) { return false; } return true; } // copy everything up to (but not including) the given end tag. // The copied text may include XML tags. // strips whitespace. 
// int XML_PARSER::element_contents(const char* end_tag, char* buf, int buflen) { int n=0; int retval=0; while (1) { if (n == buflen-1) { retval = ERR_XML_PARSE; break; } int c = f->_getc(); if (c == EOF) { retval = ERR_XML_PARSE; break; } buf[n++] = c; buf[n] = 0; char* p = strstr(buf, end_tag); if (p) { *p = 0; break; } } buf[n] = 0; strip_whitespace(buf); return retval; } // We got an unexpected tag. // If it's an end tag, do nothing. // Otherwise skip until the end tag, if any // void XML_PARSER::skip_unexpected( const char* start_tag, bool verbose, const char* where ) { char tag[256], end_tag[256]; bool is_tag; if (verbose) { fprintf(stderr, "Unrecognized XML in %s: %s\n", where, start_tag); } if (strchr(start_tag, '/')) return; sprintf(end_tag, "/%s", start_tag); while (!get(tag, sizeof(tag), is_tag)) { if (verbose) { fprintf(stderr, "Skipping: %s\n", tag); } if (!is_tag) continue; if (!strcmp(tag, end_tag)) return; skip_unexpected(tag, verbose, where); } } // sample use is shown below #if 0 void parse(FILE* f) { char tag[256]; bool is_tag, flag; MIOFILE mf; XML_PARSER xp(&mf); char name[256]; int val; double x; mf.init_file(f); if (!xp.parse_start("blah")) { printf("missing start tag\n"); return; } while (!xp.get(tag, sizeof(tag), is_tag)) { if (!is_tag) { printf("unexpected text: %s\n", tag); continue; } if (!strcmp(tag, "/blah")) { printf("success\n"); return; } else if (xp.parse_str(tag, "str", name, sizeof(name))) { printf("got str: %s\n", name); } else if (xp.parse_int(tag, "int", val)) { printf("got int: %d\n", val); } else if (xp.parse_double(tag, "double", x)) { printf("got double: %f\n", x); } else if (xp.parse_bool(tag, "bool", flag)) { printf("got bool: %d\n", flag); } else { printf("unparsed tag: %s\n", tag); xp.skip_unexpected(tag); } } printf("unexpected EOF\n"); } int main() { FILE* f = fopen("foo.xml", "r"); parse(f); } ... and run it against, e.g.: asdlfkj fj blah 6 6.555 0 #endif const char *BOINC_RCSID_3f3de9eb18 = "$Id$";