
Building an Easy-to-Use Parser/Parsing Framework (Part II) (Views: 5739)
| Problem/Question/Abstract: How to create a simple parsing framework to parse any kind of data? Answer: Welcome to the second part of my article "Building an Easy-to-Use Parser/Parsing Framework". This time, I want to show you how to create a real working dtd parser as exemplified in the first part. If you don't read my first article, please make up for this now: Building an Easy-to-Use Parser/Parsing Framework (Part I) As mentioned earlier, we need a dtd document which holds up all our parsed informations in an easy-to-access object model. Take a look at the following interface section: type { TDTDAttributeTyp } TDTDAttributeTyp = (atData, atID, atIDRef, atEnumeration); { TDTDAttributeStatus } TDTDAttributeStatus = (asDefault, asImplied, asRequired, asFixed); { TDTDChildTyp } TDTDChildTyp = (ctElement, ctChoice, ctSequence); { TDTDElementTyp } TDTDElementTyp = (etAny, etEmpty, etData, etContainer); { TDTDElementStatus } TDTDElementStatus = (esRequired, esRequiredSeq, esOptional, esOptionalSeq); { TDTDItem } TDTDItem = class(TCollectionItem) private { Private declarations } FName: string; public { Public declarations } procedure Assign(Source: TPersistent); override; published { Published declarations } property Name: string read FName write FName; end; { TDTDItems } TDTDItems = class(TCollection) private { Private declarations } function GetItem(Index: Integer): TDTDItem; procedure SetItem(Index: Integer; Value: TDTDItem); public { Public declarations } function Add: TDTDItem; function Find(const Name: string): TDTDItem; property Items[Index: Integer]: TDTDItem read GetItem write SetItem; default; end; { TDTDEntity } TDTDEntity = class(TDTDItem) private { Private declarations } public { Public declarations } procedure Assign(Source: TPersistent); override; published { Published declarations } end; { TDTDEntities } TDTDEntities = class(TDTDItems) private { Private declarations } function GetItem(Index: Integer): TDTDEntity; procedure SetItem(Index: Integer; Value: TDTDEntity); public { Public declarations } function Add: TDTDEntity; function Find(const Name: string): TDTDEntity; property Items[Index: Integer]: TDTDEntity read GetItem write SetItem; default; end; { TDTDEnum } TDTDEnum = class(TDTDItem) private { Private declarations } public { Public declarations } procedure Assign(Source: TPersistent); override; published { Published declarations } end; { TDTDEnums } TDTDEnums = class(TDTDItems) private { Private declarations } function GetItem(Index: Integer): TDTDEnum; procedure SetItem(Index: Integer; Value: TDTDEnum); public { Public declarations } function Add: TDTDEnum; function Find(const Name: string): TDTDEnum; property Items[Index: Integer]: TDTDEnum read GetItem write SetItem; default; end; { TDTDAttribute } TDTDAttribute = class(TDTDItem) private { Private declarations } FTyp: TDTDAttributeTyp; FStatus: TDTDAttributeStatus; FDefault: string; FEnums: TDTDEnums; procedure SetEnums(Value: TDTDEnums); public { Public declarations } constructor Create(Collection: TCollection); override; destructor Destroy; override; procedure Assign(Source: TPersistent); override; published { Published declarations } property Typ: TDTDAttributeTyp read FTyp write FTyp; property Status: TDTDAttributeStatus read FStatus write FStatus; property Default: string read FDefault write FDefault; property Enums: TDTDEnums read FEnums write SetEnums; end; { TDTDAttributes } TDTDAttributes = class(TDTDItems) private { Private declarations } function GetItem(Index: Integer): TDTDAttribute; procedure SetItem(Index: Integer; Value: TDTDAttribute); public { Public declarations } function Add: TDTDAttribute; function Find(const Name: string): TDTDAttribute; property Items[Index: Integer]: TDTDAttribute read GetItem write SetItem; default; end; { TDTDProperty } TDTDProperty = class(TDTDItem) private { Private declarations } FStatus: TDTDElementStatus; public { Public declarations } procedure Assign(Source: TPersistent); override; published { Published declarations } property Status: TDTDElementStatus read FStatus write FStatus; end; { TDTDProperties} TDTDProperties = class(TDTDItems) private { Private declarations } function GetItem(Index: Integer): TDTDProperty; procedure SetItem(Index: Integer; Value: TDTDProperty); public { Public declarations } function Add: TDTDProperty; function Find(const Name: string): TDTDProperty; property Items[Index: Integer]: TDTDProperty read GetItem write SetItem; default; end; { TDTDChild } TDTDChilds = class; TDTDChild = class(TDTDProperty) private { Private declarations } FTyp: TDTDChildTyp; FChilds: TDTDChilds; procedure SetChilds(const Value: TDTDChilds); public { Public declarations } constructor Create(Collection: TCollection); override; destructor Destroy; override; procedure Assign(Source: TPersistent); override; published { Published declarations } property Typ: TDTDChildTyp read FTyp write FTyp; property Childs: TDTDChilds read FChilds write SetChilds; end; { TDTDChilds} TDTDChilds = class(TDTDProperties) private { Private declarations } function GetItem(Index: Integer): TDTDChild; procedure SetItem(Index: Integer; Value: TDTDChild); public { Public declarations } function Add: TDTDChild; function Find(const Name: string): TDTDChild; property Items[Index: Integer]: TDTDChild read GetItem write SetItem; default; end; { TDTDElement } TDTDElement = class(TDTDProperty) private { Private declarations } FTyp: TDTDElementTyp; FAttributes: TDTDAttributes; FChilds: TDTDChilds; procedure SetAttributes(Value: TDTDAttributes); procedure SetChilds(Value: TDTDChilds); public { Public declarations } constructor Create(Collection: TCollection); override; destructor Destroy; override; procedure Assign(Source: TPersistent); override; published { Published declarations } property Typ: TDTDElementTyp read FTyp write FTyp; property Attributes: TDTDAttributes read FAttributes write SetAttributes; property Childs: TDTDChilds read FChilds write SetChilds; end; { TDTDElements } TDTDElements = class(TDTDProperties) private { Private declarations } function GetItem(Index: Integer): TDTDElement; procedure SetItem(Index: Integer; Value: TDTDElement); public { Public declarations } function Add: TDTDElement; function Find(const Name: string): TDTDElement; property Items[Index: Integer]: TDTDElement read GetItem write SetItem; default; end; { TDTDDocument } TDTDDocument = class(TPersistent) private { Private declarations } FEntities: TDTDEntities; FElements: TDTDElements; procedure SetEntities(Value: TDTDEntities); procedure SetElements(Value: TDTDElements); public { Public declarations } constructor Create; destructor Destroy; override; procedure Assign(Source: TPersistent); override; published { Published declarations } property Entities: TDTDEntities read FEntities write SetEntities; property Elements: TDTDElements read FElements write SetElements; end; This model implements all needed objects to parse a dtd file. Notice, that not all dtd grammars are reflected in this model, it's up to you to improve my work - but it's enough to parse all standard dtd files. Next, we need to create our dtd parser, which will be inherited by TValidationParser as professed in Part I: type { EDTDParser } EDTDParser = class(Exception); { TDTDParser } TDTDParser = class(TValidationParser) private { Private declarations } procedure ParseElement(Parser: TStringParser; Document: TDTDDocument; const Pass: Integer); procedure ParseAttlist(Parser: TStringParser; Document: TDTDDocument); procedure ParseFile(const FileName: string; Document: TDTDDocument; const Pass: Integer = 0); public { Public declarations } procedure Parse(const FileName: string; var Document: TDTDDocument); end; The new exception class EDTDParser will be raised, if the passed filename is physical not available. One of the weightily methods is Parse. The first parameter must be an existing filename of the dtd file to be parsed. The second parameter is the document which holds our object model and must be pre-initialized. The implementation of this method is as followed: 01. procedure TDTDParser.Parse(const FileName: string; var Document: TDTDDocument); 02. var 03. TmpDocument: TDTDDocument; 04. begin 05. if not assigned(Document) then 06. raise EDTDParser.Create('Document not assigned!'); 07. TmpDocument := TDTDDocument.Create; 08. try 09. ParseFile(FileName, TmpDocument); 10. if Errors.Count = 0 then 11. Document.Assign(TmpDocument); 12. finally 13. TmpDocument.Free; 14. end; 15. end; In Line 5 we're looking if the passed document was successfully initialized; if not, an exception (EDTDParser) will be raised. After comparing that, we create a new temporary instance of a dtd document (Line 7) and parse the passed filename (Line 9). If no errors occured (Line 10) we make a copy of the filled dtd document by assigning it to the passed one (Line 11). Consecutively we take a look to the ParseFile procedure, which initializes the main parsing process and looks for the basic keywords: (Note: The italic lines are not part of the sourcecode - they are used to explain the unique sections) procedure TDTDParser.ParseFile(const FileName: string; Document: TDTDDocument; const Pass: Integer = 0); var Parser: TStringParser; begin {Create a new instance of the TStringParser.} Parser := TStringParser.Create; try {Check, if the passed filename already exists.} if not Parser.LoadFromFile(FileName) then begin AddErrorFmt('File "%s" not found', [FileName], Parser); Exit; end; {Initialize an endless loop.} while True do begin {Skip to the next valid Tag-Begin-Token "<" or EOF.} while not (Parser.Token in [toEOF, '<']) do Parser.SkipToken; {Break look, if current Token is EOF - End of File.} if Parser.Token = toEOF then Break; {Get the next Token - after Tag-Begin "<".} Parser.SkipToken; {Check for valid identification Tag "!" or "?".} if Parser.Token <> '!' then begin {Only add an error if the current Pass is one "1".} if not (Parser.Token in ['?']) and (Pass = 1) then AddError('InvalidToken', Parser); Continue; end; {Check for valid Symbol or Comment Line.} if Parser.SkipToken <> toSymbol then begin if (Parser.Token <> '-') and (Pass = 1) then AddError('InvalidToken', Parser); Continue; end; {Check for "Entity" Tag.} if UpperCase(Parser.TokenString) = 'ENTITY' then Continue; {Check for "Element" Tag.} if UpperCase(Parser.TokenString) = 'ELEMENT' then ParseElement(Parser, Document, Pass) else {Check for "Attribute" Tag.} if UpperCase(Parser.TokenString) = 'ATTLIST' then begin if Pass = 1 then ParseAttlist(Parser, Document); end {Add an error on invalid Symbols.} else if Pass = 1 then AddErrorFmt('Invalid Symbol "%s"', [Parser.TokenString], Parser); end; {Initialize Pass 2 - if currently finished Pass 1.} if Pass = 0 then ParseFile(FileName, Document, 1); finally Parser.Free; end; end; The ParseFile method simply starts parsing the main structure of a dtd file and tries to extract some basic keywords like Entity, Element or Attribute. If one of the last two keywords were found, a special (ParseElement or ParseAttlist) method is called to create the corresponding object and to extract additional informations. If the parser founds any syntax or grammar errors, respectively items are created. The method ParseElement includes the functionality to parse and extract further informations, like Type or Rule: (Note: The italic lines are not part of the sourcecode - they are used to explain the unique sections) procedure TDTDParser.ParseElement(Parser: TStringParser; Document: TDTDDocument; const Pass: Integer); var Element: TDTDElement; Child: TDTDChild; Rule: string; begin {Get the next Token.} Parser.SkipToken; {On first pass, create a new element.} if Pass = 0 then Element := Document.Elements.Add {On second pass, find previous created element.} else Element := Document.Elements.Find(Parser.TokenString); {Set the new element name.} Element.Name := Parser.TokenString; try {Add an error if the current Token isn't a symbol.} if Parser.Token <> toSymbol then Abort; {Check for element rule, like "any", "empty" or "sequence"...} Rule := UpperCase(Parser.SkipTokenString); {...Found Rule: "ANY".} if (Rule = 'ANY') and (Parser.SkipToken = '>') then begin Element.Typ := etAny; Exit; end; {...Found Rule: "EMPTY".} if (Rule = 'EMPTY') and (Parser.SkipToken = '>') then begin Element.Typ := etEmpty; Exit; end; if (Rule = '(') then begin {...Found Rule: "PCDATA".} if Parser.SkipToken in [toEOF, '>'] then Abort; if Parser.Token = '#' then begin if UpperCase(Parser.SkipToToken('>')) = 'PCDATA)' then begin Element.Typ := etData; Exit; end; Abort; end; {...Found Rule: "sequence/container".} Element.Typ := etContainer; repeat {Create Child objects, if pass = 1.} Child := nil; if not (Parser.Token in ['|', ',', ')']) then begin if Pass = 0 then begin Child := Element.Childs.Add; Child.Name := Parser.TokenString; Child.Typ := ctElement; end else if Document.Elements.Find(Parser.TokenString) = nil then AddErrorFmt('Invalid Element Target "%s"', [Parser.TokenString], Parser); end; Parser.SkipToken; {Check Child Status (=sequence style).} if Parser.Token in ['+', '?', '*'] then begin if Child <> nil then case Parser.Token of '+': Child.Status := esRequiredSeq; '?': Child.Status := esOptional; '*': Child.Status := esOptionalSeq; end; Parser.SkipToken; end; until Parser.SkipToken in [toEOF, '>']; Exit; end; {Add an error only on pass 1.} if Pass = 1 then AddErrorFmt('Invalid Element Rule "%s"', [Rule], Parser); except {Add an error only on pass 1.} if Pass = 1 then AddError('InvalidElementFormat', Parser); end; end; The method ParseAttlist includes the functionality to parse and extract further informations, like Type or Enumerations: (Note: The italic lines are not part of the sourcecode - they are used to explain the unique sections) procedure TDTDParser.ParseAttlist(Parser: TStringParser; Document: TDTDDocument); var Attribute: TDTDAttribute; Element: TDTDElement; Target, Typ: string; begin {Get the next Token.} Target := Parser.SkipTokenString; try {Add an error if the current Token isn't a symbol.} if Parser.Token <> toSymbol then Abort; {Try to find the element target.} Element := Document.Elements.Find(Target); {Add an error if no element was found.} if Element = nil then begin AddErrorFmt('Invalid Element Target "%s"', [Target], Parser); Exit; end; {Get the next Token.} Parser.SkipToken; repeat {Add an error if the current Token isn't a symbol.} if Parser.Token <> toSymbol then Abort; {Create a new Attribute under the located element.} Attribute := Element.Attributes.Add; {Set the new name.} Attribute.Name := Parser.TokenString; {Check for Attribute Type...} Typ := Parser.SkipTokenString; {...Found Type "CDDATA".} if UpperCase(Typ) = 'CDATA' then Attribute.Typ := atData else {...Found Type "ID".} if UpperCase(Typ) = 'ID' then Attribute.Typ := atID else {...Found Type "IDREF".} if UpperCase(Typ) = 'IDREF' then Attribute.Typ := atIDRef else {...Found Type "enumeration".} if Typ = '(' then begin Attribute.Typ := atEnumeration; {Seperate enumeration parts and attach them} {to the parent attribute.} repeat Parser.SkipToken; if not (Parser.Token in ['|', ')']) then Attribute.Enums.Add.Name := Parser.TokenString; until Parser.Token in [toEOF, ')']; {Add an error, if current token is "EOF".} if Parser.Token = toEOF then begin AddErrorFmt('Invalid Enumeration End in Attribute "%s"', [Attribute.Name], Parser); Exit; end; end else begin AddErrorFmt('Invalid Attribute Typ "%s"', [Typ], Parser); Exit; end; {Check for Restrictions...} Parser.SkipToken; if Parser.Token = '#' then begin {...Found Restriction "IMPLIED".} Typ := UpperCase(Parser.SkipTokenString); if Typ = 'IMPLIED' then begin Attribute.Status := asImplied; Parser.SkipToken; end; {...Found Restriction "REQUIRED".} if Typ = 'REQUIRED' then begin Attribute.Status := asRequired; Parser.SkipToken; end; {...Found Restriction "FIXED".} if Typ = 'FIXED' then begin Attribute.Status := asFixed; Parser.SkipToken; end; end; {Extract an optional default value.} if Parser.Token = '"' then begin if Attribute.Status = asImplied then Abort; Attribute.Default := Trim(Parser.SkipToToken('"')); Parser.SkipToken; end; until Parser.Token = '>'; except AddErrorFmt('Invalid Attribute Format "%s"', [Target], Parser); end; end; Note: The above methods only detects simple dtd grammas. To parse all possible tags and additional grammars you had to include a more complex algorithm to do that - for our purposes (and this article) it's enough. If you are not familiar with the dtd syntax, check out the site W3Schools. Okay, at this point we have finished our object-model and parser implementation. All we need now is an example application which will take use of this units. Our demo application will parse a dtd file, detects the structure and creates a simple xml output with a given startup node. Take a look at the following dtd: name CDATA #REQUIRED value CDATA #REQUIRED > Type (Error | Warning | Information) #REQUIRED > Our demo application will create the following xml output: In this case, the startup node is BeratungsKontextResp which will be used as the root node for all other nodes. Our example is implemented as a console application as followed: program dtd2xml; {$APPTYPE CONSOLE} uses SysUtils, DTD_Parser in 'DTD_Parser.pas', DTD_Document in 'DTD_Document.pas', StringParser in 'StringParser.pas', PrivateParser in 'PrivateParser.pas'; var FileName: string; Switch_XMLRoot: string; Switch_XMLData: Boolean; Switch_RootLst: Boolean; DTDDocument: TDTDDocument; DTDParser: TDTDParser; RootElement: TDTDElement; i: Integer; {----------------------------------------------------------------------------- Procedure: FindCmdSwitch Author: mh Date: 23-Jan-2002 Arguments: const Switch: string; const Default: string = '' Result: string -----------------------------------------------------------------------------} function FindCmdSwitch(const Switch: string; const Default: string = ''): string; var i: Integer; begin Result := ''; for i := 1 to ParamCount do if UpperCase(Copy(ParamStr(i), 1, Length(Switch))) = UpperCase(Switch) then begin Result := Copy(ParamStr(i), Length(Switch) + 1, MAXINT); Exit; end; if Result = '' then Result := Default; end; {----------------------------------------------------------------------------- Procedure: WriteXML Author: mh Date: 23-Jan-2002 Arguments: const AElement: TDTDElement; const AStatus: TDTDElementStatus; Indent: Integer = 0 Result: None -----------------------------------------------------------------------------} procedure WriteXML(const AElement: TDTDElement; const AStatus: TDTDElementStatus; Indent: Integer = 0); var i: Integer; Spacer, Def: string; begin for i := 1 to Indent * 2 do Spacer := Spacer + #32; Write(Spacer + '<' + AElement.Name); for i := 0 to AElement.Attributes.Count - 1 do with AElement.Attributes[i] do begin Def := Default; if (Switch_XMLData) and (Def = '') then begin if Typ = atEnumeration then begin if Enums.Count > 0 then Def := Enums[0].Name else Def := '???'; end else Def := Name; end; Write(Format(' %s="%s"', [Name, Def])); end; if AElement.Typ <> etContainer then begin Def := ''; if (Switch_XMLData) and (AElement.Typ <> etEmpty) then Def := AElement.Name; WriteLn(Format('>%s', [Def, AElement.Name])); end else WriteLn('>'); for i := 0 to AElement.Childs.Count - 1 do WriteXML(DTDDocument.Elements.Find(AElement.Childs[i].Name), AElement.Childs[i].Status, Indent + 1); if AElement.Typ = etContainer then WriteLn(Spacer + Format('', [AElement.Name])); end; {----------------------------------------------------------------------------- Procedure: main Author: mh Date: 23-Jan-2002 Arguments: None Result: None -----------------------------------------------------------------------------} begin // display usage. if (ParamCount = 0) or (FindCmdSwitch('-?', '?') <> '?') then begin WriteLn(''); WriteLn('dtd2xml (parser framework example) version 1.0'); WriteLn('(w)ritten 2002 by Marc Hoffmann. GNU License'); WriteLn(''); WriteLn('Usage: dtd2xml [options] [-?]'); WriteLn(''); WriteLn('Options:'); WriteLn('-xmlroot= XML root element (? = possible elements)'); WriteLn('-xmldata=yes|no Include XML Example data (default = yes)'); WriteLn(''); Exit; end; // exract filename. FileName := ParamStr(1); // append default extenstion, if ExtractFileExt(FileName) = '' then FileName := ChangeFileExt(FileName, '.dtd'); // file exists? if not FileExists(FileName) then begin WriteLn(Format('Fatal: File not found ''%s''.', [FileName])); Exit; end; // extract command-line switches. Switch_RootLst := FindCmdSwitch('-xmlroot=') = '?'; Switch_XMLRoot := FindCmdSwitch('-xmlroot='); Switch_XMLData := UpperCase(FindCmdSwitch('-xmldata=')) <> 'NO'; // create new dtd-document. DTDDocument := TDTDDocument.Create; try // create new dtd-parser. DTDParser := TDTDParser.Create; try // parse file. DTDParser.Parse(FileName, DTDDocument); // display possible errors. if DTDParser.Errors.Count > 0 then begin for i := 0 to DTDParser.Errors.Count - 1 do with DTDParser.Errors[i] do WriteLn(Format('Error in Line %d, Pos %d: %s...', [Line, Position, Message])); Exit; end; // search rootelement. RootElement := DTDDocument.Elements.Find(Switch_XMLRoot); // display rootelements & assign possible object. for i := 0 to DTDDocument.Elements.Count - 1 do if DTDDocument.Elements[i].Typ = etContainer then begin if Switch_RootLst then WriteLn(DTDDocument.Elements[i].Name) else if (Switch_XMLRoot = '') and ((RootElement = nil) or ((RootElement <> nil) and (RootElement.Childs.Count < DTDDocument.Elements[i].Childs.Count))) then RootElement := DTDDocument.Elements[i]; end; // exit app if rootlist-switch was set. if Switch_RootLst then Exit; // exit app if rootelement is NIL. if RootElement = nil then begin WriteLn(Format('Fatal: Root Element ''%s'' not found.', [Switch_XMLRoot])); Exit; end; // exit app if rootelement is invalid. if RootElement.Typ <> etContainer then begin WriteLn(Format('Fatal: ''%s'' is not a valid Root Element.', [Switch_XMLRoot])); Exit; end; // write xml output. WriteLn(Format('' + #13 + '', [RootElement.Name, ExtractFileName(FileName)])); WriteLn(''); WriteXML(RootElement, RootElement.Status); // free dtd-parser. finally DTDParser.Free; end; // free dtd-document. finally DTDDocument.Free; end; end. |