Callback for freeing some parser input allocations.
str :
the string to deallocate
struct xmlParserInput
/*
 * An input flow for the XML processor. Each parsed entity (other than the
 * few predefined ones) is associated with one of these. For internal
 * entities the content is already completely in memory; for external
 * entities the buf structure is used for progressive reading and for
 * conversion to the internal UTF-8 format.
 */
struct xmlParserInput {
/* Input buffer */
xmlParserInputBufferPtr buf; /* UTF-8 encoded buffer */
const char *filename; /* The file analyzed, if any */
const char *directory; /* The directory/base of the file */
const xmlChar *base; /* Base of the array to parse */
const xmlChar *cur; /* Current char being parsed */
const xmlChar *end; /* End of the array to parse */
int length; /* Length, if known */
int line; /* Current line */
int col; /* Current column */
int consumed; /* How many xmlChars have already been consumed */
xmlParserInputDeallocate free; /* Function used to deallocate the base */
const xmlChar *encoding; /* The encoding string for the entity */
const xmlChar *version; /* The version string for the entity */
int standalone; /* Whether the entity was marked standalone */
};
An xmlParserInput is an input flow for the XML processor.
Each entity parsed is associated with an xmlParserInput (except the
few predefined ones). This is the case both for internal entities
- in which case the flow is already completely in memory - and for
external entities - in which case we use the buf structure for
progressive reading and I18N conversions to the internal UTF-8 format.
struct xmlParserNodeInfo
/*
 * Location record for one node: where in the input the text that created
 * the node begins and ends. Only filled in when the parser is asked to
 * collect node information, which is off by default.
 */
struct xmlParserNodeInfo {
const struct _xmlNode* node; /* The node the positions below refer to */
/* Position and line number at which the text that created the node begins and ends */
unsigned long begin_pos; /* Start position */
unsigned long begin_line; /* Start line number */
unsigned long end_pos; /* End position */
unsigned long end_line; /* End line number */
};
The parser can be asked to collect node information, i.e. at what
place in the file each node was detected.
NOTE: This is off by default and not very well tested.
xmlParserNodeInfoPtr
/* Pointer to an xmlParserNodeInfo record. */
typedef xmlParserNodeInfo *xmlParserNodeInfoPtr;
struct xmlParserNodeInfoSeq
/*
 * A sequence of xmlParserNodeInfo records. The parser context embeds one
 * (its node_seq field) to hold info about each node parsed.
 */
struct xmlParserNodeInfoSeq {
unsigned long maximum; /* presumably the allocated capacity of buffer -- TODO confirm */
unsigned long length; /* presumably the number of records in use -- TODO confirm */
xmlParserNodeInfo* buffer; /* The xmlParserNodeInfo records */
};
/*
 * Parser input states. The parser also works as a state-based parser, and
 * the recursive parser uses this state information for entity processing.
 * The explicit values (-1 for EOF, 0 for START) are part of the interface.
 */
typedef enum {
XML_PARSER_EOF = -1, /* nothing is to be parsed */
XML_PARSER_START = 0, /* nothing has been parsed */
XML_PARSER_MISC, /* Misc* before the internal subset */
XML_PARSER_PI, /* within a processing instruction */
XML_PARSER_DTD, /* within some DTD content */
XML_PARSER_PROLOG, /* Misc* after the internal subset */
XML_PARSER_COMMENT, /* within a comment */
XML_PARSER_START_TAG, /* within a start tag */
XML_PARSER_CONTENT, /* within the content */
XML_PARSER_CDATA_SECTION, /* within a CDATA section */
XML_PARSER_END_TAG, /* within a closing tag */
XML_PARSER_ENTITY_DECL, /* within an entity declaration */
XML_PARSER_ENTITY_VALUE, /* within an entity value in a declaration */
XML_PARSER_ATTRIBUTE_VALUE, /* within an attribute value */
XML_PARSER_SYSTEM_LITERAL, /* within a SYSTEM value */
XML_PARSER_EPILOG, /* the Misc* after the last end tag */
XML_PARSER_IGNORE, /* within an IGNORED section */
XML_PARSER_PUBLIC_LITERAL /* within a PUBLIC value */
} xmlParserInputState;
The parser now also works as a state based parser.
The recursive one uses the state info for entity processing.
XML_DETECT_IDS
/* Bit in the loadsubset context field requesting ID/REF lookups. */
#define XML_DETECT_IDS 2
Bit in the loadsubset context field to tell to do ID/REFs lookups.
Use it to initialize xmlLoadExtDtdDefaultValue.
XML_COMPLETE_ATTRS
/*
 * Bit in the loadsubset context field requesting that element attribute
 * lists be completed with the attributes defaulted from the DTDs.
 */
#define XML_COMPLETE_ATTRS 4
Bit in the loadsubset context field telling the parser to complete the
element attribute lists with the ones defaulted from the DTDs.
Use it to initialize xmlLoadExtDtdDefaultValue.
struct xmlParserCtxt
/*
 * The parser context. It does not completely define the parser state:
 * the parser is designed around recursive function calls, so the actual
 * function calls also reflect the state. Most parsing routines take a
 * pointer to this context as their only argument.
 */
struct xmlParserCtxt {
struct _xmlSAXHandler *sax; /* The SAX handler */
void *userData; /* For SAX interface only, used by DOM build */
xmlDocPtr myDoc; /* The document being built */
int wellFormed; /* Is the document well formed? */
int replaceEntities; /* Shall we replace entities? */
const xmlChar *version; /* The XML version string */
const xmlChar *encoding; /* The declared encoding, if any */
int standalone; /* Standalone document */
int html; /* An HTML (1) / Docbook (2) document */
/* Input stream stack */
xmlParserInputPtr input; /* Current input stream */
int inputNr; /* Number of current input streams */
int inputMax; /* Max number of input streams */
xmlParserInputPtr *inputTab; /* Stack of inputs */
/* Node analysis stack, only used for DOM building */
xmlNodePtr node; /* Current parsed node */
int nodeNr; /* Depth of the parsing stack */
int nodeMax; /* Max depth of the parsing stack */
xmlNodePtr *nodeTab; /* Array of nodes */
int record_info; /* Whether node info should be kept */
xmlParserNodeInfoSeq node_seq; /* Info about each node parsed */
int errNo; /* Error code */
int hasExternalSubset; /* Reference and external subset */
int hasPErefs; /* The internal subset has PE refs */
int external; /* Are we parsing an external entity? */
int valid; /* Is the document valid? */
int validate; /* Shall we try to validate? */
xmlValidCtxt vctxt; /* The validity context */
xmlParserInputState instate; /* Current type of input */
int token; /* Next char look-ahead */
char *directory; /* The data directory */
/* Node name stack */
xmlChar *name; /* Name of the current parsed node */
int nameNr; /* Depth of the parsing stack */
int nameMax; /* Max depth of the parsing stack */
xmlChar * *nameTab; /* Array of node names */
long nbChars; /* Number of xmlChar processed */
long checkIndex; /* Used by progressive parsing lookup */
int keepBlanks; /* ugly but ... */
int disableSAX; /* SAX callbacks are disabled */
int inSubset; /* Parsing is in the internal (1) / external (2) subset */
xmlChar * intSubName; /* Name of the subset */
xmlChar * extSubURI; /* URI of the external subset */
xmlChar * extSubSystem; /* SYSTEM ID of the external subset */
/* xml:space values */
int * space; /* Should the parser preserve spaces? */
int spaceNr; /* Depth of the parsing stack */
int spaceMax; /* Max depth of the parsing stack */
int * spaceTab; /* Array of space infos */
int depth; /* To prevent entity substitution loops */
xmlParserInputPtr entity; /* Used to check entity boundaries */
int charset; /* Encoding of the in-memory content;
actually an xmlCharEncoding */
int nodelen; /* These two fields (nodelen and nodemem) are there to */
int nodemem; /* speed up large node parsing */
int pedantic; /* Signal pedantic warnings */
void *_private; /* For user data, libxml won't touch it */
int loadsubset; /* Should the external subset be loaded? Also carries the
XML_DETECT_IDS and XML_COMPLETE_ATTRS bits */
int linenumbers; /* Set line number in element content */
void *catalogs; /* Document's own catalog */
};
The parser context.
NOTE: This doesn't completely define the parser state. The (current?)
design of the parser uses recursive function calls, since this allows
an easy mapping from the production rules of the specification
to the actual code. The drawback is that the actual function calls
also reflect the parser state. However, most of the parsing routines
take the parser context pointer as their only argument, so migrating
to a state based parser for progressive parsing shouldn't be too hard.
struct xmlSAXLocator
/*
 * A set of four callbacks, each taking an opaque ctx pointer.
 * NOTE(review): judging by the member names they report the public ID,
 * system ID, line number and column number of the current parsing
 * location -- confirm against the SAX interface documentation.
 */
struct xmlSAXLocator {
const xmlChar *(*getPublicId)(void *ctx); /* presumably the public ID -- TODO confirm */
const xmlChar *(*getSystemId)(void *ctx); /* presumably the system ID -- TODO confirm */
int (*getLineNumber)(void *ctx); /* presumably the current line -- TODO confirm */
int (*getColumnNumber)(void *ctx); /* presumably the current column -- TODO confirm */
};
Callback:
The entity loader. To control the loading of external entities,
the application can either:
- override this resolveEntity() callback in the SAX block
- or, better, use the xmlSetExternalEntityLoader() function to
set up its own entity resolution routine
ctx :
the user data (XML parser context)
publicId :
The public ID of the entity
systemId :
The system ID of the entity
Returns :
the xmlParserInputPtr if inlined or NULL for DOM behaviour.
Handle an attribute that has been read by the parser.
The default handling is to convert the attribute into a
DOM subtree and paste it into a new xmlAttr element added to
the element.
Global variable controlling the entity substitution default behavior.
xmlInitParser ()
/*
 * Initialization function for the XML parser. Not reentrant: call it once
 * before processing when the library is used in a multithreaded program.
 */
void xmlInitParser (void);
Initialization function for the XML parser.
This is not reentrant. Call once before processing in case of
use in multithreaded programs.
xmlCleanupParser ()
/*
 * Cleanup function for the XML parser. Tries to reclaim all parsing-related
 * global memory; it does not deallocate any document-related memory, and
 * calling it should not prevent reusing the parser.
 */
void xmlCleanupParser (void);
Cleanup function for the XML parser. It tries to reclaim all
parsing related global memory allocated for the parser processing.
It doesn't deallocate any document related memory. Calling this
function should not prevent reusing the parser.
A strdup for arrays of xmlChar. Since they are supposed to be
encoded in UTF-8 or an encoding with 8-bit based chars, we assume
a terminating zero byte ('\0').
A strcat for arrays of xmlChar. Since they are supposed to be
encoded in UTF-8 or an encoding with 8-bit based chars, we assume
a terminating zero byte ('\0').
cur :
the original xmlChar * array
add :
the xmlChar * array added
Returns :
a new xmlChar * containing the concatenated string.
parse an XML file and build a tree. Automatic support for ZLIB/Compress
compressed document is provided by default if found at compile-time.
filename :
the filename
Returns :
the resulting document tree if the file was wellformed,
NULL otherwise.
xmlSubstituteEntitiesDefault ()
/*
 * Set the default for entity substitution and return the previous value.
 *
 * val: 0 or 1
 *
 * Returns the previous value (0 for no substitution, 1 for substitution).
 */
int xmlSubstituteEntitiesDefault (int val);
Set and return the previous value for default entity support.
Initially the parser always keeps entity references instead of substituting
entity values in the output. This function has to be used to change the
default parser behavior.
SAX::substituteEntities() has to be used for changing that on a file by
file basis.
val :
int 0 or 1
Returns :
the previous value: 0 for no substitution, 1 for substitution.
xmlKeepBlanksDefault ()
/*
 * Set the default for blank text node support and return the previous value.
 *
 * val: 0 or 1
 *
 * Returns the previous value of the setting.
 */
int xmlKeepBlanksDefault (int val);
Set and return the previous value for default blanks text nodes support.
The 1.x version of the parser used a heuristic to try to detect
ignorable white spaces. As a result the SAX callback was generating
ignorableWhitespace() callbacks instead of characters() ones, and when
using the DOM output, text nodes containing those blanks were not generated.
The 2.x and later versions will switch to the XML standard way, and
ignorableWhitespace() callbacks are only generated when running the parser in
validating mode and when the current element doesn't allow CDATA or
mixed content.
This function is provided as a way to force the standard behavior
on 1.X libs and to switch back to the old mode for compatibility when
running 1.X client code on 2.X. Upgrade of 1.X code should be done
by using the xmlIsBlankNode() commodity function to detect the "empty"
nodes generated.
This value also affects autogeneration of indentation when saving code:
if blanks sections are kept, indentation is not generated.
val :
int 0 or 1
Returns :
the previous value of the keep-blanks default (0 or 1).
parse an XML file and build a tree. Automatic support for ZLIB/Compress
compressed document is provided by default if found at compile-time.
In the case the document is not Well Formed, a tree is built anyway
Parse an XML in-memory document and build a tree.
It uses the given SAX function block to handle the parsing callbacks.
If sax is NULL, it falls back to the default DOM tree building routines.
sax :
the SAX handler block
cur :
a pointer to an array of xmlChar
recovery :
work in recovery mode, i.e. tries to read documents that are not
well formed
Returns :
the resulting document tree
xmlSAXUserParseFile ()
/*
 * Parse an XML file and call the given SAX handler routines.
 *
 * sax:       a SAX handler
 * user_data: the user data returned on SAX callbacks
 * filename:  a file name
 *
 * Returns 0 in case of success or an error number otherwise.
 */
int xmlSAXUserParseFile (xmlSAXHandlerPtr sax,
void *user_data,
const char *filename);
Parse an XML file and call the given SAX handler routines.
Automatic support for ZLIB/Compress compressed document is provided
by default if found at compile-time.
sax :
a SAX handler
user_data :
The user data returned on SAX callbacks
filename :
a file name
Returns :
0 in case of success or an error number otherwise
xmlSAXUserParseMemory ()
/*
 * Parse an XML in-memory buffer and call the given SAX handler routines.
 *
 * NOTE(review): the parameter descriptions for this function are missing
 * from this extract; by analogy with xmlSAXUserParseFile, sax is presumably
 * the SAX handler and user_data the data returned on SAX callbacks, with
 * buffer/size giving the in-memory buffer and its length -- TODO confirm.
 */
int xmlSAXUserParseMemory (xmlSAXHandlerPtr sax,
void *user_data,
const char *buffer,
int size);
A better SAX parsing routine.
parse an XML in-memory buffer and call the given SAX handler routines.
parse an XML in-memory block and use the given SAX function block
to handle the parsing callback. If sax is NULL, fallback to the default
DOM tree building routines.
sax :
the SAX handler block
buffer :
a pointer to a char array
size :
the size of the array
recovery :
work in recovery mode, i.e. tries to read not Well Formed
documents
Parse an XML file and build a tree. Automatic support for ZLIB/Compress
compressed document is provided by default if found at compile-time.
It uses the given SAX function block to handle the parsing callbacks.
If sax is NULL, it falls back to the default DOM tree building routines.
sax :
the SAX handler block
filename :
the filename
recovery :
work in recovery mode, i.e. tries to read documents that are not
well formed
Parse an XML file and build a tree. Automatic support for ZLIB/Compress
compressed document is provided by default if found at compile-time.
It uses the given SAX function block to handle the parsing callbacks.
If sax is NULL, it falls back to the default DOM tree building routines.
User data (void *) is stored within the parser context in the
context's _private member, so it is available nearly everywhere in libxml.
sax :
the SAX handler block
filename :
the filename
recovery :
work in recovery mode, i.e. tries to read documents that are not
well formed
Parse an XML external entity out of context and build a tree.
It uses the given SAX function block to handle the parsing callbacks.
If sax is NULL, it falls back to the default DOM tree building routines.
Parse a well-balanced chunk of an XML document
called by the parser
The allowed sequence for the Well Balanced Chunk is the one defined by
the content production in the XML grammar:
[43] content ::= (element | CharData | Reference | CDSect | PI | Comment)*
Parse a well-balanced chunk of an XML document
called by the parser
The allowed sequence for the Well Balanced Chunk is the one defined by
the content production in the XML grammar:
[43] content ::= (element | CharData | Reference | CDSect | PI | Comment)*
The user data returned on SAX callbacks (possibly NULL)
depth :
Used for loop detection, use 0
string :
the input string in UTF8 or ISO-Latin (zero terminated)
lst :
the return value for the set of parsed nodes
recover :
return nodes even if the data is broken (use 0)
Returns :
0 if the chunk is well balanced, -1 in case of args problem and
the parser error code otherwise
In case recover is set to 1, the nodelist will not be empty even if
the parsed chunk is not well balanced.
Parse an external general entity within an existing parsing context
An external general parsed entity is well-formed if it matches the
production labeled extParsedEnt.
[78] extParsedEnt ::= TextDecl? content
ctx :
the existing parsing context
URL :
the URL for the entity to load
ID :
the System ID for the entity to load
lst :
the return value for the set of parsed nodes
Returns :
0 if the entity is well formed, -1 in case of args problem and
the parser error code otherwise
Set up the parser context to parse a new buffer; clears any prior
contents from the parser context. The buffer parameter must not be
NULL, but the filename parameter can be.
Create a parser context for using the XML parser in push mode
To allow content encoding detection, size should be >= 4
The value of filename is used for fetching external entities
and error/warning reports.
sax :
a SAX handler
user_data :
The user data returned on SAX callbacks
chunk :
a pointer to an array of chars
size :
number of chars in the array
filename :
an optional file name or URI
Returns :
the new parser context or NULL
xmlParseChunk ()
/*
 * NOTE(review): the description of xmlParseChunk is missing from this
 * extract. Judging by the signature and the push-mode context creation
 * documented nearby, it presumably feeds `size` chars from `chunk` to the
 * push parser context `ctxt`, with `terminate` marking the last chunk --
 * TODO confirm against the libxml parser reference.
 */
int xmlParseChunk (xmlParserCtxtPtr ctxt,
const char *chunk,
int size,
int terminate);
Load an external entity. Note that the use of this function for
unparsed entities may generate problems.
TODO: a more generic External entity API must be designed.