summaryrefslogtreecommitdiff
path: root/impl/antlr/libantlr3c-3.4/include/antlr3lexer.h
blob: 6cc10efad2237b40500b9a204cc1fb3c354f7a8a (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
/** \file 
 * Base interface for any ANTLR3 lexer.
 *
 * An ANLTR3 lexer builds from two sets of components:
 *
 *  - The runtime components that provide common functionality such as
 *    traversing character streams, building tokens for output and so on.
 *  - The generated rules and struutre of the actual lexer, which call upon the
 *    runtime components.
 *
 * A lexer class contains  a character input stream, a base recognizer interface 
 * (which it will normally implement) and a token source interface (which it also
 * implements. The Tokensource interface is called by a token consumer (such as
 * a parser, but in theory it can be anything that wants a set of abstract
 * tokens in place of a raw character stream.
 *
 * So then, we set up a lexer in a sequence akin to:
 *
 *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
 *    and initialize it.
 *  - Create a lexer interface and tell it where it its input stream is.
 *    This will cause the creation of a base recognizer class, which it will 
 *    override with its own implementations of some methods. The lexer creator
 *    can also then in turn override anything it likes. 
 *  - The lexer token source interface is then passed to some interface that
 *    knows how to use it, byte calling for a next token. 
 *  - When a next token is called, let ze lexing begin.
 *
 */
#ifndef	_ANTLR3_LEXER
#define	_ANTLR3_LEXER

// [The "BSD licence"]
// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
// http://www.temporal-wave.com
// http://www.linkedin.com/in/jimidle
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/* Definitions
 */
#define	ANTLR3_STRING_TERMINATOR	0xFFFFFFFF

#include    <antlr3defs.h>
#include    <antlr3input.h>
#include    <antlr3commontoken.h>
#include    <antlr3tokenstream.h>
#include    <antlr3baserecognizer.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef	struct ANTLR3_LEXER_struct
{
    /** If there is a super structure that is implementing the
     *  lexer, then a pointer to it can be stored here in case
     *  implementing functions are overridden by this super structure.
     */
    void	* super;

    /** A generated lexer has an mTokens() function, which needs
     *  the context pointer of the generated lexer, not the base lexer interface
     *  this is stored here and initialized by the generated code (or manually
     *  if this is a manually built lexer.
     */
    void	* ctx;

    /** A pointer to the character stream whence this lexer is receiving
     *  characters. 
     *  TODO: I may come back to this and implement charstream outside
     *  the input stream as per the java implementation.
     */
    pANTLR3_INPUT_STREAM	input;

    /** Pointer to the implementation of a base recognizer, which the lexer
     *  creates and then overrides with its own lexer oriented functions (the 
     *  default implementation is parser oriented). This also contains a
     *  token source interface, which the lexer instance will provide to anything 
     *  that needs it, which is anything else that implements a base recognizer,
     *  such as a parser.
     */
    pANTLR3_BASE_RECOGNIZER	rec;

    /** Pointer to a function that sets the charstream source for the lexer and
     *  causes it to  be reset.
     */
    void			(*setCharStream)    (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
    
    /** Pointer to a function that switches the current character input stream to 
     *  a new one, saving the old one, which we will revert to at the end of this 
     *  new one.
     */
    void			(*pushCharStream)   (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);

    /** Pointer to a function that abandons the current input stream, whether it
     *  is empty or not and reverts to the previous stacked input stream.
     */
    void			(*popCharStream)    (struct ANTLR3_LEXER_struct * lexer);

    /** Pointer to a function that emits the supplied token as the next token in
     *  the stream.
     */
    void			(*emitNew)	    (struct ANTLR3_LEXER_struct * lexer, pANTLR3_COMMON_TOKEN token);

    /** Pointer to a function that constructs a new token from the lexer stored information 
     */
    pANTLR3_COMMON_TOKEN	(*emit)		    (struct ANTLR3_LEXER_struct * lexer);

    /** Pointer to the user provided (either manually or through code generation
     *  function that causes the lexer rules to run the lexing rules and produce 
     *  the next token if there iss one. This is called from nextToken() in the
     *  pANTLR3_TOKEN_SOURCE. Note that the input parameter for this funciton is 
     *  the generated lexer context (stored in ctx in this interface) it is a generated
     *  function and expects the context to be the generated lexer. 
     */
    void	        (*mTokens)		    (void * ctx);

    /** Pointer to a function that attempts to match and consume the specified string from the input
     *  stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated
     *  with 0xFFFFFFFF, which is an invalid UTF32 character
     */
    ANTLR3_BOOLEAN	(*matchs)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR * string);

    /** Pointer to a function that matches and consumes the specified character from the input stream.
     *  The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
     *  implementation is source encoding agnostic and so input streams do not generally need to 
     *  override the default implmentation.
     */
    ANTLR3_BOOLEAN	(*matchc)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR c);

    /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too
     *  but this would only be useful if the tokens were in tsome guaranteed order which is
     *  only going to happen with a hand crafted token set).
     */
    ANTLR3_BOOLEAN	(*matchRange)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR low, ANTLR3_UCHAR high);

    /** Pointer to a function that matches the next token/char in the input stream
     *  regardless of what it actaully is.
     */
    void		(*matchAny)	    (struct ANTLR3_LEXER_struct * lexer);

    /** Pointer to a function that recovers from an error found in the input stream.
     *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
     *  be from a mismatched token that the (*match)() could not recover from.
     */
    void		(*recover)	    (struct ANTLR3_LEXER_struct * lexer);

    /** Pointer to function to return the current line number in the input stream
     */
    ANTLR3_UINT32	(*getLine)		(struct ANTLR3_LEXER_struct * lexer);
    ANTLR3_MARKER	(*getCharIndex)		(struct ANTLR3_LEXER_struct * lexer);
    ANTLR3_UINT32	(*getCharPositionInLine)(struct ANTLR3_LEXER_struct * lexer);

    /** Pointer to function to return the text so far for the current token being generated
     */
    pANTLR3_STRING	(*getText)	    (struct ANTLR3_LEXER_struct * lexer);


    /** Pointer to a function that knows how to free the resources of a lexer
     */
    void		(*free)		    (struct ANTLR3_LEXER_struct * lexer);

}
    ANTLR3_LEXER;

#ifdef __cplusplus
}
#endif

#endif