escaping.cc

Go to the documentation of this file.
00001 
00002 // -*- mode: c++; c-basic-offset:4 -*-
00003 
00004 // This file is part of libdap, A C++ implementation of the OPeNDAP Data
00005 // Access Protocol.
00006 
00007 // Copyright (c) 2002,2003 OPeNDAP, Inc.
00008 // Author: James Gallagher <jgallagher@opendap.org>
00009 //
00010 // This library is free software; you can redistribute it and/or
00011 // modify it under the terms of the GNU Lesser General Public
00012 // License as published by the Free Software Foundation; either
00013 // version 2.1 of the License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful,
00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00023 //
00024 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
00025 
00026 // Copyright (c) 1996, California Institute of Technology.
00027 // ALL RIGHTS RESERVED.   U.S. Government Sponsorship acknowledged.
00028 //
00029 // Please read the full copyright notice in the file COPYRIGHT_URI
00030 // in this directory.
00031 //
00032 // Author: Todd Karakashian, NASA/Jet Propulsion Laboratory
00033 //         Todd.K.Karakashian@jpl.nasa.gov
00034 //
00035 // $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server
00036 //
00037 // These two routines are for escaping/unescaping strings that are identifiers
00038 // in DAP2
00039 // id2www() -- escape (using WWW hex codes) non-allowable characters in a
00040 // DAP2 identifier
00041 // www2id() -- given a WWW hex-code escaped identifier, restore it
00042 //
00043 // These two routines are for escaping/unescaping strings storing attribute
00044 // values.  They use traditional octal escapes (\nnn) because they are
00045 // intended to be viewed by a user
00046 // escattr() -- escape (using traditional octal backslash) non-allowable
00047 // characters in the value of a DAP2 attribute
00048 // unescattr() -- given an octally escaped string, restore it
00049 //
00050 // These are routines used by the above, not intended to be called directly:
00051 //
00052 // hexstring()
00053 // unhexstring()
00054 // octstring()
00055 // unoctstring()
00056 //
00057 // -Todd
00058 
00059 #include <ctype.h>
00060 
00061 #include <iomanip>
00062 #include <string>
00063 #include <sstream>
00064 
00065 #include "GNURegex.h"
00066 #include "Error.h"
00067 #include "InternalErr.h"
00068 // #define DODS_DEBUG
00069 #include "debug.h"
00070 
00071 using namespace std;
00072 
00073 // The next four functions were originally defined static, but I removed that
00074 // to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001
00075 // jhrg
00076 
/**
 * Format a byte as exactly two lowercase hexadecimal digits.
 *
 * @param val The byte to format.
 * @return A two-character string, zero padded (for example 10 yields "0a").
 */
std::string
hexstring(unsigned char val)
{
    std::ostringstream out;

    // Configure the stream through member functions; width() only applies
    // to the next formatted insertion, which is the single one below.
    out.setf(std::ios::hex, std::ios::basefield);
    out.fill('0');
    out.width(2);

    // Widen first so the value is printed as a number, not as a character.
    out << static_cast<unsigned int>(val);

    return out.str();
}
00086 
/**
 * Convert a string of hexadecimal digits into the single character whose
 * code it encodes.
 *
 * @param s Text holding a hexadecimal number (for example "41").
 * @return A one-character string holding the decoded character. When the
 * text cannot be parsed as a hexadecimal number, or when it decodes to
 * zero, the result is the empty string (the character is copied through a
 * NUL-terminated buffer).
 */
std::string
unhexstring(std::string s)
{
    // Start from a known value: when the extraction fails (empty or
    // non-hex input) older library versions leave the target untouched,
    // and reading it uninitialized would be undefined behaviour.
    int val = 0;

    std::istringstream ss(s);
    ss >> std::hex >> val;

    const char tmp_str[2] = { static_cast<char>(val), '\0' };
    return std::string(tmp_str);
}
00098 
/**
 * Format a byte as exactly three octal digits.
 *
 * @param val The byte to format.
 * @return A three-character string, zero padded (for example 10 yields
 * "012").
 */
std::string
octstring(unsigned char val)
{
    std::ostringstream out;

    // Configure the stream through member functions; width() only applies
    // to the next formatted insertion, which is the single one below.
    out.setf(std::ios::oct, std::ios::basefield);
    out.fill('0');
    out.width(3);

    // Widen first so the value is printed as a number, not as a character.
    out << static_cast<unsigned int>(val);

    return out.str();
}
00108 
00109 string
00110 unoctstring(string s)
00111 {
00112     int val;
00113 
00114     istringstream ss(s);
00115     ss >> oct >> val;
00116 
00117     DBG(cerr << "unoctstring: " << val << endl);
00118 
00119     char tmp_str[2];
00120     tmp_str[0] = static_cast<char>(val);
00121     tmp_str[1] = '\0';
00122     return string(tmp_str);
00123 }
00124 
/**
 * Replace every character of @p in that is not listed in @p allowable with
 * a WWW-style escape: a percent sign followed by the character's code as
 * two lowercase hexadecimal digits (the same form hexstring() produces).
 *
 * @param in The string to escape.
 * @param allowable The set of characters that are copied unchanged.
 * @return The escaped string.
 */
std::string
id2www(std::string in, const std::string &allowable)
{
    static const char hex_digits[] = "0123456789abcdef";

    std::string::size_type i = 0;
    while ((i = in.find_first_not_of(allowable, i)) != std::string::npos) {
        // Go through unsigned char so bytes above 0x7f are not sign
        // extended before being split into nibbles.
        const unsigned char c = static_cast<unsigned char>(in[i]);

        std::string esc(1, '%');
        esc += hex_digits[c >> 4];
        esc += hex_digits[c & 0x0f];

        in.replace(i, 1, esc);

        // Skip the whole escape just written. Stepping over only the '%'
        // would rescan the two hex digits and escape them again whenever
        // they are not in the allowable set, which never terminates.
        i += esc.length();
    }

    return in;
}
00161 
00172 string
00173 id2www_ce(string in, const string &allowable)
00174 {
00175     return id2www(in, allowable);
00176 }
00177 
00206 string
00207 www2id(const string &in, const string &escape, const string &except)
00208 {
00209     string::size_type i = 0;
00210     string res = in;
00211     while ((i = res.find_first_of(escape, i)) != string::npos) {
00212         if (res.substr(i, 3) == except) {
00213             i += 3;
00214             continue;
00215         }
00216         res.replace(i, 3, unhexstring(res.substr(i + 1, 2)));
00217     }
00218 
00219     return res;
00220 }
00221 
00222 static string
00223 entity(char c)
00224 {
00225     switch (c) {
00226     case '>': return "&gt;";
00227     case '<': return "&lt;";
00228     case '&': return "&amp;";
00229     case '\'': return "&apos;";
00230     case '\"': return "&quot;";
00231     default:
00232         throw InternalErr(__FILE__, __LINE__, "Unrecognized character.");
00233     }
00234 }
00235 
00242 string
00243 id2xml(string in, const string &not_allowed)
00244 {
00245     string::size_type i = 0;
00246 
00247     while ((i = in.find_first_of(not_allowed, i)) != string::npos) {
00248         in.replace(i, 1, entity(in[i]));
00249         i++;
00250     }
00251 
00252     return in;
00253 }
00254 
/**
 * Replace the five XML entities &gt; &lt; &amp; &apos; and &quot; with the
 * characters they stand for.
 *
 * The string is decoded in one left-to-right pass and the text produced by
 * a replacement is never examined again, so each entity is decoded exactly
 * once: "&amp;quot;" yields the text "&quot;", not a double quote.
 *
 * @param in The string to decode.
 * @return The decoded string. An '&' that does not start one of the five
 * entities is copied unchanged.
 */
std::string
xml2id(std::string in)
{
    static const char *const names[] = { "&gt;", "&lt;", "&amp;", "&apos;", "&quot;" };
    static const char values[] = { '>', '<', '&', '\'', '"' };
    const std::string::size_type count = sizeof(values) / sizeof(values[0]);

    std::string::size_type i = 0;
    while ((i = in.find('&', i)) != std::string::npos) {
        for (std::string::size_type k = 0; k < count; ++k) {
            const std::string::size_type len = std::string::traits_type::length(names[k]);
            if (in.compare(i, len, names[k]) == 0) {
                in.replace(i, len, 1, values[k]);
                break;
            }
        }

        // Step past either the decoded character or an unrelated '&'.
        ++i;
    }

    return in;
}
00286 
/**
 * Replace every '%' and the (up to) two characters following it with a
 * single underscore.
 *
 * @param s The string to process.
 * @return The string with each three-character '%' sequence replaced by
 * "_". A '%' with fewer than two characters after it is replaced together
 * with whatever remains of the string.
 */
std::string
esc2underscore(std::string s)
{
    std::string::size_type pos = 0;
    while ((pos = s.find('%', pos)) != std::string::npos) {
        s.replace(pos, 3, "_");

        // Everything up to and including the underscore is free of '%',
        // so continue from here instead of rescanning from the start.
        ++pos;
    }

    return s;
}
00301 
00302 
/**
 * Escape an attribute value for display.
 *
 * Each character of @p s is handled exactly once:
 *  - a backslash becomes two backslashes;
 *  - a double quote becomes backslash + double quote;
 *  - any other character not in the printable set becomes a backslash
 *    followed by its code as three octal digits (the same digits
 *    octstring() produces);
 *  - everything else is copied unchanged.
 *
 * Working one character at a time keeps the backslash that introduces an
 * octal escape from being doubled: a newline is written as \012, not as
 * \\012.
 *
 * @param s The attribute value.
 * @return The escaped value.
 */
std::string
escattr(std::string s)
{
    const std::string printable = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\"";

    std::string out;
    out.reserve(s.length());

    for (std::string::size_type i = 0; i < s.length(); ++i) {
        const char c = s[i];

        if (c == '\\') {
            out += "\\\\";
        }
        else if (c == '"') {
            out += "\\\"";
        }
        else if (printable.find(c) == std::string::npos) {
            // Go through unsigned char so bytes above 0x7f are not sign
            // extended; the value is at most 0377, i.e. three digits.
            const unsigned char u = static_cast<unsigned char>(c);
            out += '\\';
            out += static_cast<char>('0' + ((u >> 6) & 7));
            out += static_cast<char>('0' + ((u >> 3) & 7));
            out += static_cast<char>('0' + (u & 7));
        }
        else {
            out += c;
        }
    }

    return out;
}
00336 
00345 string
00346 unescattr(string s)
00347 {
00348     Regex octal("\\\\[0-3][0-7][0-7]");  // matches 4 characters
00349     Regex esc_quote("\\\\\"");  // matches 3 characters
00350     Regex esc_esc("\\\\\\\\");      // matches 2 characters
00351     const string ESC = "\\";
00352     const string QUOTE = "\"";
00353     int matchlen;
00354     unsigned int index;
00355 
00356     DBG(cerr << "0XX" << s << "XXX" << endl);
00357     // unescape any escaped backslashes
00358     index = 0;
00359     index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
00360     while (index < s.length()) {
00361         DBG(cerr << "1aXX" << s << "XXX index: " << index << endl);
00362         s.replace(index, 2, ESC);
00363         DBG(cerr << "1bXX" << s << "XXX index: " << index << endl);
00364         index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
00365     }
00366 
00367     // unescape any escaped double quote characters
00368     index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
00369     while (index < s.length()) {
00370         s.replace(index, 2, QUOTE);
00371         DBG(cerr << "2XX" << s << "XXX index: " << index << endl);
00372         index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
00373     }
00374 
00375     // unescape octal characters
00376     index = octal.search(s.c_str(), s.length(), matchlen, 0);
00377     while (index < s.length()) {
00378         s.replace(index, 4, unoctstring(s.substr(index + 1, 3)));
00379         DBG(cerr << "3XX" << s << "XXX index: " << index << endl);
00380         index = octal.search(s.c_str(), s.length(), matchlen, 0);
00381     }
00382 
00383     DBG(cerr << "4XX" << s << "XXX" << endl);
00384     return s;
00385 }
00386 
/**
 * Make an error message safe to emit as a double-quoted string.
 *
 * The message is wrapped in double quotes unless it already starts and/or
 * ends with one, and every double quote strictly inside the enclosing pair
 * that is not already preceded by a backslash gets one inserted before it.
 *
 * @param msg The message text; may be empty.
 * @return The quoted message. An empty message, or a message that is just
 * one double quote, yields a pair of double quotes.
 */
std::string
munge_error_message(std::string msg)
{
    // First, add enclosing quotes if needed. The size checks keep an empty
    // message, or one that is a lone quote, from being read out of bounds
    // and make sure the opening and closing quotes are two distinct
    // characters.
    if (msg.empty() || msg[0] != '"')
        msg.insert(msg.begin(), '"');
    if (msg.size() < 2 || msg[msg.size() - 1] != '"')
        msg += '"';

    // Now escape any internal double quotes that aren't escaped. The size
    // is re-read on every iteration because insertions grow the string.
    for (std::string::size_type i = 1; i + 1 < msg.size(); ++i) {
        if (msg[i] == '"' && msg[i - 1] != '\\') {
            msg.insert(i, 1, '\\');
            ++i;    // step over the quote that was just escaped
        }
    }

    return msg;
}
00404 

Generated on Wed Jun 27 12:56:39 2007 for libdap++ by  doxygen 1.4.7