Simple SOAP: SOAPParser.cpp Source File

00001 
00010 // SOAPParser.cpp: implementation of the SOAPParser class.
00011 //
00013 
00014 /*
00015         History:
00016 
00017         24-Aug-04       G. Cerchio      work around isspace assert with debug lib
00018         21-Aug-04       G. Cerchio      output the input to the debug window
00019         28-Mar-06       G. Cerchio      added comment parsing on xml stream
00020 */
00021 
00022 #ifdef _MSC_VER 
00023 // Microsoft only extension to the compiler
00024 // Turns off noise about debug symbols being "too long".
00025 #pragma warning( disable : 4786 )
00026 #endif // _MSC_VER 
00027 
00028 #ifndef _STACK_
00029     #include <stack>
00030 #endif // _STACK_
00031 
00032 #if !defined(SOAPELEMENT_H)
00033     #include "SOAPElement.h"
00034 #endif // !defined(SOAPELEMENT_H)
00035 #include <iostream>
00036 #ifndef _SSTREAM_
00037     #include <sstream>
00038 #endif // _SSTREAM_
00039 
00040 #include "SOAPParser.h"
00041 
00042 
00044 // Construction/Destruction
00046 
00047 #ifdef _MSC_VER
00048 // the MSVC debug isspace asserts on UTF8 on windows
00049 #undef isspace
// Locale-independent whitespace test used in place of the CRT version.
// Returns 1 for space, tab, newline, carriage return, form feed and
// vertical tab (0x0b); returns 0 for every other value, including
// negative ones.
static int isspace( int c )
{
    switch ( c )
    {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\f':
        case 0xb:
            return 1;

        default:
            return 0;
    }
}
00068 #endif 
00069 
00070 // Constants
// Names of the well-known SOAP elements.
const std::string g_KszEnvelope( "Envelope" );
const std::string g_KszHeader( "Header" );
const std::string g_KszBody( "Body" );
const std::string g_KszFault( "Fault" );
// Characters that may open (and must then close) a quoted value.
const std::string g_KszQuoteTypes( "\"'" );
00076 SOAPParser::SOAPParser()
00077 {
00078 
00079 }
00080 
00081 SOAPParser::~SOAPParser()
00082 {
00083 
00084 }
00085 
00086 bool SOAPParser::parseMessage( const std::string& szMessage, 
00087     SOAPElement& soapMessage )
00088 {
00089         long nCurrentPos = 0;
00090     m_namespaceMap.clear();
00091     return parseMessage( szMessage, soapMessage, nCurrentPos );
00092 }
00093 
00094 
00095 bool SOAPParser::parseMessage( const std::string& szMessage, 
00096     SOAPElement& soapElement, long& nCurrentPos )
00097 {
00098     bool retval = true;
00099     std::string szEndTag;
00100 
00101     // Start looking for the start-tag.
00102     std::string szCurrentstring;
00103     const long KnLength = szMessage.length();
00104 
00105     // Keep marching in the string until we hit the first '<'
00106     for ( ; ( szMessage[nCurrentPos] != '<' ) 
00107          && ( nCurrentPos < KnLength ); ++nCurrentPos )
00108     {
00109         // body intentionally left empty.
00110     }
00111 
00112         // skip past a comment - GJPC
00113         if ( ( szMessage[ nCurrentPos + 1 ] == '!' ) &&
00114                  ( szMessage[ nCurrentPos + 2 ] == '-' ) &&
00115                  ( szMessage[ nCurrentPos + 3 ] == '-' ) )
00116         {
00117                 for ( ; ( szMessage[nCurrentPos]   != '-' )  && 
00118                                 ( szMessage[nCurrentPos+1] != '-' ) && 
00119                                 ( szMessage[nCurrentPos+2] != '>' ) &&
00120                                 ( nCurrentPos < KnLength+2 ); ++nCurrentPos )
00121                 {
00122                         // body intentionally left empty.
00123                 }
00124                 if ( nCurrentPos < KnLength )
00125                         for ( ; ( szMessage[nCurrentPos] != '<' ) 
00126                                 && ( nCurrentPos < KnLength ); ++nCurrentPos )
00127                         {
00128                                 // body intentionally left empty.
00129                         }
00130         }
00131     // If we got to the end without finding the
00132     // end tag (or any begin tag), return false.
00133     if ( nCurrentPos == KnLength )
00134     {
00135         setFailed();
00136         std::ostringstream szStream;
00137         getFault()->setSpecificFault( "ImproperlyFormattedMessage" );
00138         getFault()->faultString() = "The message either has no begin "
00139             "tag or is missing the end tag.  In either case, "
00140             "the message was not formatted correctly.";
00141         return false;
00142     }
00143 
00144     std::string szBeginTag;
00145     std::string szNamespace;
00146     std::string szAccessorName;
00147     std::string szFullAccessorName;
00148     bool bIsEmptyTag = false;
00149     bool bIsEndTag = false;
00150     for ( ; retval && (nCurrentPos < KnLength); ++nCurrentPos )
00151     {
00152         // Initialize all variables.
00153         bIsEmptyTag = false;
00154         bIsEndTag = false;
00155         szBeginTag = "";
00156         szNamespace = "";
00157         szAccessorName = "";
00158         szFullAccessorName = "";
00159         long nInitialBeginPos = nCurrentPos;
00160 
00161         // Get the information contained by the next pair
00162         // of "<>".
00163         retval = extractBeginTag( szBeginTag, szNamespace, 
00164             szAccessorName, szMessage, nCurrentPos, bIsEmptyTag, 
00165             bIsEndTag );
00166 
00167         // If we are expecting an end tag, but this isn't
00168         // one, it must be another child item.  Start
00169         // parsing at this location and attach a new "tree".
00170         if ( ( szEndTag.length() > 0 ) && (!bIsEndTag ) )
00171         {
00172             // extract the contents of this item independently
00173             nCurrentPos = nInitialBeginPos;
00174             SOAPElement* pElement = new SOAPElement;
00175             soapElement.addElement( pElement );
00176                         
00177             retval = parseMessage( szMessage, *pElement, nCurrentPos );
00178 
00179                         //If the current pos is on the tag, back up one place (for loop will bring it correctly)
00180                         if ( (retval) && (szMessage[nCurrentPos] == '<') && (nCurrentPos) )
00181             {
00182                                 nCurrentPos--;
00183             }
00184             // We've now parsed all the data within the tree.
00185             // Go to the top of the loop again.  If retval is
00186             // false, the error data should be filled in
00187             // and we will drop out of the loop.
00188             continue;
00189         }
00190 
00191         if ( !retval )
00192         {
00193             // Unwind the stack-- we have an error so the rest of the
00194             // doc doesn't matter.
00195             setFailed();
00196             std::ostringstream szStream;
00197             szStream << "Failed near position " << nCurrentPos << 
00198                 " within the message.";
00199             getFault()->setSpecificFault( "ImproperlyFormattedMessage" );
00200             getFault()->faultString() = szStream.str();
00201             break;
00202         }
00203 
00204         // If we are at the end of this tag, prepare to close
00205         // out the current SOAPElement.
00206         if ( bIsEndTag )
00207         {
00208             if ( szEndTag.empty() )
00209             {
00210                 // We got an end tag when we weren't expecting one.
00211                 setFailed();
00212                 std::ostringstream szStream;
00213                 szStream << "Failed near position " << nCurrentPos << 
00214                     " within the message." << "Expected end tag: " <<
00215                     szEndTag << std::ends;
00216                 getFault()->setSpecificFault( "EmptyEndTag" );
00217                 getFault()->faultString() = szStream.str();
00218                 retval = false;
00219                 break;
00220             }
00221             if ( szBeginTag != szEndTag )
00222             {
00223                 // This isn't the end tag we were expecting.
00224                 // Let the user know that the doc isn't formatted
00225                 // correctly.
00226                 setFailed();
00227                 std::ostringstream szStream;
00228                 szStream << "Failed near position " << nCurrentPos <<
00229                     " within the message." << "Expected end tag: " <<
00230                     szEndTag;
00231                 getFault()->setSpecificFault( "WrongEndTag" );
00232                 getFault()->faultString() = szStream.str();
00233                 retval = false;
00234                 break;
00235             }
00236             
00237             retval = true;
00238 
00239             // This was the expected end tag.  We are done
00240             // with parsing for this element.
00241             break;
00242         }
00243 
00244         // Pull the namespaces out of the tag.  They
00245         // look like attributes to the rest of the code.
00246         retval = extractNamespaces( szBeginTag );
00247 
00248         if ( !retval )
00249         {
00250             // Gack, something went wrong.
00251             setFailed();
00252             std::ostringstream szStream;
00253             szStream << "Failed near position " << nCurrentPos <<
00254                 " within the message.";
00255             getFault()->setSpecificFault( "NamespaceExtractionFailed",
00256                 SOAPFault::Server );
00257             getFault()->faultString() = szStream.str();
00258             retval = false;
00259             break;
00260         }
00261 
00262         // If the given element was specified using a namespace,
00263         // tack it on to the start of the namespace name.
00264         if ( szNamespace.length() > 0 )
00265         {
00266             szFullAccessorName = szNamespace + std::string(":");
00267             soapElement.namespaceName() = szNamespace;
00268         }
00269         szFullAccessorName += szAccessorName;
00270         soapElement.accessorName() = szAccessorName;
00271 
00272         // If this is the envelope, check the version.
00273         if ( soapElement.accessorName() == "Envelope" )
00274         {
00275             bool bFound = false;
00276             // Check to see if the SOAP namespace was set.
00277             // If not, we have a VersionMismatch problem.
00278             for ( XMLNStoURN::iterator it = m_namespaceMap.begin();
00279                 it != m_namespaceMap.end(); ++it )
00280             {
00281                 if ( "http://schemas.xmlsoap.org/soap/envelope/" == 
00282                     it->second)
00283                 {
00284                     bFound = true;
00285                     break;
00286                 }
00287             }
00288             if ( !bFound )
00289             {
00290                 setFailed();
00291                 getFault()->setFaultCode( SOAPFault::VersionMismatch );
00292                 getFault()->faultString() = "Version mismatch found.";
00293                 retval = false;
00294                 break;
00295             }
00296         }
00297 
00298         // set the text for the expected end tag.
00299         szEndTag = std::string( "/" );
00300         if ( szNamespace.length() > 0 )
00301         {
00302             szEndTag += szNamespace + std::string(":");
00303         }
00304         szEndTag += soapElement.accessorName();
00305 
00306         // Pull the attribute information out of the begin tag.
00307         retval = extractAttributes( soapElement, szBeginTag );
00308         if ( !retval )
00309         {
00310             setFailed();
00311             std::ostringstream szStream;
00312             szStream << "Attribute extraction: Failed near position " << 
00313                 nCurrentPos << " within the message." << std::ends;
00314             getFault()->setSpecificFault( "ImproperlyFormattedMessage" );
00315             getFault()->faultString() = szStream.str();
00316             break;
00317         }
00318 
00319                 // if this is an empty tag we just move along GJPC
00320                 if ( bIsEmptyTag )
00321                         break;
00322 
00323         // Take out the value contained between the begin and end tags.
00324                 int tmpPos = nCurrentPos;
00325                 //nCurrentPos = tmpPos;
00326         retval = extractValue( soapElement, szMessage, nCurrentPos );
00327         if ( !retval )
00328         {
00329             setFailed();
00330             std::ostringstream szStream;
00331             szStream << "Value extraction: Failed near position " << 
00332                 nCurrentPos << " within the message." << std::ends;
00333             getFault()->setSpecificFault( "ImproperlyFormattedMessage" );
00334             getFault()->faultString() = szStream.str();
00335             break;
00336         }
00337         else 
00338         {
00339                         //Change on 07/03/2001 by SM
00340                         //If the current pos is on the tag, back up one place (for loop will bring it correctly)
00341                         if ( (szMessage[nCurrentPos] == '<') && (nCurrentPos) )
00342             {
00343                                 nCurrentPos--;
00344             }
00345                 }
00346     }
00347     return retval;
00348 }
00349 
00350 bool SOAPParser::extractBeginTag(
00351     std::string& szBeginTag, 
00352     std::string& szNamespace, 
00353     std::string& szAccessorName, 
00354     std::string szMessage, 
00355     long &nCurrentPos, 
00356     bool &bIsEmptyTag, 
00357     bool &bIsEndTag)
00358 {
00359     bool retval = true;
00360     bool bExtractedAccessorName = false;
00361     bool bIsSpace = false;
00362     std::string szFullAccessor;
00363     const long KnLength = szMessage.length();    
00364     long nBeginEndTagPos = nCurrentPos;
00365     bIsEmptyTag = false;
00366     bIsEndTag = false;
00367     szBeginTag = "";
00368     szNamespace = "";
00369     szAccessorName = "";
00370 
00371     for( ; ( nCurrentPos < KnLength ) && ( '>' != szMessage[nCurrentPos] ); ++nCurrentPos )
00372     {
00373         // Check if this is an empty tag
00374         if ( ( '/' == szMessage[nCurrentPos] ) && 
00375              ( ( nCurrentPos + 1 ) < KnLength ) &&
00376              ( '>' == szMessage[nCurrentPos + 1] ) )
00377         {
00378             bIsEmptyTag = true;
00379             nCurrentPos += 2;
00380             break;
00381         }
00382 
00383         // Check if this is an end tag
00384         if ( ( '/' == szMessage[nCurrentPos] ) && 
00385              ( ( nCurrentPos - 1 ) >= 0 ) &&
00386              ( '<' == szMessage[nCurrentPos - 1] ) )
00387         {
00388             bIsEndTag = true;
00389             long nEndOfTag = szMessage.find( ">", nCurrentPos );
00390             szBeginTag = szMessage.substr( nCurrentPos, nEndOfTag - nCurrentPos );
00391             nCurrentPos = nEndOfTag;
00392             break;
00393         }
00394 
00395         // Check if we've extracted the accessor name yet.
00396         if ( bExtractedAccessorName )
00397         {
00398             szBeginTag += szMessage[nCurrentPos];
00399         }
00400         else
00401         {
00402             if ( isspace( szMessage[nCurrentPos] ) )
00403             {
00404                 if ( szFullAccessor.length() > 0 )
00405                 {
00406                     bExtractedAccessorName = true;
00407                 }
00408             }
00409             else if ( ( szFullAccessor.length() > 0 ) || 
00410                      ( isalnum( szMessage[nCurrentPos] ) ) )
00411             {
00412                 szFullAccessor += szMessage[nCurrentPos];
00413             }
00414         }
00415     }
00416 
00417     // Split up the namespace if any and accessor name.
00418     splitNSAndAccessor( szFullAccessor, szNamespace, szAccessorName );
00419  
00420     if ( '>' == szMessage[nCurrentPos] )
00421     {
00422         // walk past the end of the element
00423         ++nCurrentPos;
00424     }
00425     return retval;
00426 }
00427 
00428 bool SOAPParser::extractNamespaces( std::string &szCompleteAccessor )
00429 {
00430     bool retval = true;
00431     
00432     // The idea here is to extract and remove the namespace declarations.
00433     const std::string KszXMLNS = "xmlns:";
00434     const long KnLenXMLNS = KszXMLNS.length();
00435     const long KnLength = szCompleteAccessor.length();
00436 
00437     std::string szTempAccessor;
00438     std::string szNamespaceName;
00439     std::string szNamespaceURI;
00440     char cQuoteChar = 0;
00441     long nEqualsPos = 0;
00442     long nQuotePos = 0;
00443     long nEndQuotePos = 0;
00444     bool bFoundANamespace = false;
00445     for ( long nPos = 0; nPos < KnLength; ++nPos )
00446     {
00447         nPos = szCompleteAccessor.find( KszXMLNS, nPos );
00448         if ( std::string::npos == nPos )
00449         {
00450             // No more occurences of the string exist after nPos
00451             break;
00452         }
00453         bFoundANamespace = true;
00454         szTempAccessor += szCompleteAccessor.substr( nEndQuotePos + 1, nPos - nEndQuotePos - 1 );
00455         // Capture the namespace name.
00456         nEqualsPos = szCompleteAccessor.find( std::string("="), nPos );
00457         szNamespaceName = szCompleteAccessor.substr( nPos + KnLenXMLNS, nEqualsPos - nPos - KnLenXMLNS );
00458         // Capture the namespace URI.
00459         // find out the enclosing quote type
00460         nPos = nEqualsPos;
00461         szNamespaceURI = extractQuotedString( szCompleteAccessor, nPos );
00462         nEndQuotePos = nPos;
00463         m_namespaceMap[szNamespaceName] = szNamespaceURI;
00464     }
00465         
00466     if ( (nEndQuotePos > 0 ) && (KnLength - nEndQuotePos - 1 > 0) ) 
00467     {
00468         szTempAccessor += szCompleteAccessor.substr( nEndQuotePos + 1, KnLength - nEndQuotePos - 1 );
00469     }
00470     // Replace the string with the one without namespace declarations.
00471     // We took those out so that they wouldn't look like 
00472     // attributes later.
00473     if ( bFoundANamespace )
00474     {
00475         szCompleteAccessor = szTempAccessor;
00476     }
00477     return retval;
00478 }
00479 
00480 bool SOAPParser::extractAttributes(SOAPElement &theElement, std::string szBeginTag)
00481 {
00482     bool retval = true;
00483     const long KnLength = szBeginTag.length();
00484     long nPos = 0;
00485     long nEqualsPos = 0;
00486     long nQuotePos = 0;
00487     long nEndQuotePos = 0;
00488     std::string szNamespace;
00489     // Message may have a bunch of leading whitespace.  Eat it up.
00490 
00491     for ( nPos = 0; ( nPos < KnLength ) && isspace( szBeginTag[nPos] ); ++nPos )
00492     {
00493         // just advances through the loop until it finds a non-whitespace
00494         // character.
00495     }
00496 
00497     for ( ; nPos < KnLength; ++nPos )
00498     {
00499         SOAPAttribute anAttribute;
00500         nEqualsPos = szBeginTag.find( std::string("="), nPos );
00501         if ( std::string::npos == nEqualsPos )
00502         {
00503             break;
00504         }
00505         szNamespace = szBeginTag.substr( nPos, nEqualsPos - nPos );
00506         splitNSAndAccessor( szNamespace, anAttribute.namespaceName(), anAttribute.accessor() );
00507         nPos = nEqualsPos;
00508         anAttribute.value() = extractQuotedString( szBeginTag, nPos );
00509         theElement.addAttribute( anAttribute );
00510     }
00511     return retval;
00512 }
00513 
00514 bool SOAPParser::extractValue(SOAPElement &theElement, const std::string& szMessage, long &nCurrentPos)
00515 {
00516     bool retval = true;
00517     const long KnLength = szMessage.length();    
00518     std::string szValue;
00519     bool bProcessingWhiteSpace = false;
00520     bool bIsSpace = false;
00521     for ( ; ( szMessage[nCurrentPos] != '<' ) && ( nCurrentPos < KnLength );
00522         ++nCurrentPos )
00523     {
00524         bIsSpace = ( 0 != isspace(szMessage[nCurrentPos]) );
00525         if ( !( bIsSpace && bProcessingWhiteSpace ))
00526         {
00527             bProcessingWhiteSpace = false;
00528             szValue += szMessage[nCurrentPos];
00529         }
00530         else if ( bIsSpace && !bProcessingWhiteSpace )
00531         {
00532             // Because we eat whitespace, any whitespace should initially
00533             // generate a space, then nothing until
00534             // the next non-whitespace character.
00535             szValue += ' ';
00536             bProcessingWhiteSpace = true;
00537         }
00538     }
00539     
00540     theElement.value() = szValue;
00541     if ( szMessage[nCurrentPos] == '<' )
00542     {
00543         long nLTPosition = nCurrentPos;
00544         // Figure out if this is a begin tag or an end tag
00545         bool bIsEndTag = false;
00546         for ( ; ( szMessage[nCurrentPos] != '/' ) && ( nCurrentPos < KnLength );
00547             ++nCurrentPos )
00548         {
00549             if ( isspace( szMessage[nCurrentPos] ) )
00550             {
00551                 continue;
00552             }
00553             if ( isalnum( szMessage[nCurrentPos] ) || (szMessage[nCurrentPos] == '_') )
00554             {
00555                 break;
00556             }
00557         }
00558         bIsEndTag = szMessage[nCurrentPos] == '/';
00559         if ( szMessage[nCurrentPos] != '/' )
00560         {
00561             SOAPElement* pElement = new SOAPElement();
00562             theElement.addElement( pElement );
00563             nCurrentPos = nLTPosition;
00564             retval = parseMessage( szMessage, *pElement, nCurrentPos );
00565         }
00566         else
00567         {
00568             nCurrentPos = nLTPosition;
00569         }
00570     }
00571     
00572     return retval;
00573 }
00574 
00575 void SOAPParser::splitNSAndAccessor(std::string szFullString, std::string &szNamespace, std::string &szOther)
00576 {
00577     const long KnNameLength = szFullString.length();
00578     long nColonPos = szFullString.find_first_of( std::string(":") );
00579     bool bHasNameSpace = nColonPos != std::string::npos;
00580     if ( bHasNameSpace )
00581     {
00582         szNamespace = szFullString.substr( 0, nColonPos );
00583         szOther = szFullString.substr( nColonPos + 1, KnNameLength );
00584     }
00585     else
00586     {
00587         szOther = szFullString;
00588     }
00589 }
00590 
00591 
00592 std::string SOAPParser::extractQuotedString(const std::string &szString, long &nPos)
00593 {
00594     std::string szRetval;
00595     const long KnLength = szString.length();
00596     char cQuoteChar = 0;
00597     long nQuotePos = 0;
00598     long nEndQuotePos = 0;
00599     nQuotePos = szString.find_first_of( g_KszQuoteTypes, nPos );
00600 
00601     if ( std::string::npos != nQuotePos )
00602     {
00603         cQuoteChar = szString[nQuotePos];
00604         nEndQuotePos = szString.find( cQuoteChar, nQuotePos + 1 );
00605         szRetval = szString.substr( nQuotePos + 1, nEndQuotePos - nQuotePos - 1 );
00606         nPos = nEndQuotePos + 1;
00607     }
00608     return szRetval;
00609 }
00610 
00611 SOAPParser::XMLNStoURN& SOAPParser::getNamespacesInUse()
00612 {
00613     return m_namespaceMap;
00614 }
00615 
00616 void SOAPParser::setFailed()
00617 {
00618     m_pFault = std::auto_ptr<SOAPFault>( new SOAPFault );
00619 }
00620 
00621 SOAPFault* SOAPParser::getFault()
00622 {
00623     // Don't create it if we didn't need it.
00624     return m_pFault.get();
00625 }