I used antlr4 to generate a python target using the Python3.g4 grammar file from the antlr grammars repo. The generated Python3Lexer.py file contained Java code which I needed to translate to python. Here are the two java segments it outputted, you can find them both inside the python3 grammar file here also
// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>();
// The stack that keeps track of the indentation level.
private java.util.Stack<Integer> indents = new java.util.Stack<>();
// The amount of opened braces, brackets and parenthesis.
private int opened = 0;
// The most recently produced token.
private Token lastToken = null;
@Override
public void emit(Token t) {
super.setToken(t);
tokens.offer(t);
}
@Override
public Token nextToken() {
// Check if the end-of-file is ahead and there are still some DEDENTS expected.
if (_input.LA(1) == EOF && !this.indents.isEmpty()) {
// Remove any trailing EOF tokens from our buffer.
for (int i = tokens.size() - 1; i >= 0; i--) {
if (tokens.get(i).getType() == EOF) {
tokens.remove(i);
}
}
// First emit an extra line break that serves as the end of the statement.
this.emit(commonToken(Python3Parser.NEWLINE, "\n"));
// Now emit as much DEDENT tokens as needed.
while (!indents.isEmpty()) {
this.emit(createDedent());
indents.pop();
}
// Put the EOF back on the token stream.
this.emit(commonToken(Python3Parser.EOF, "<EOF>"));
}
Token next = super.nextToken();
if (next.getChannel() == Token.DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this.lastToken = next;
}
return tokens.isEmpty() ? next : tokens.poll();
}
private Token createDedent() {
CommonToken dedent = commonToken(Python3Parser.DEDENT, "");
dedent.setLine(this.lastToken.getLine());
return dedent;
}
private CommonToken commonToken(int type, String text) {
int stop = this.getCharIndex() - 1;
int start = text.isEmpty() ? stop : stop - text.length() + 1;
return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop);
}
static int getIndentationCount(String spaces) {
int count = 0;
for (char ch : spaces.toCharArray()) {
switch (ch) {
case '\t':
count += 8 - (count % 8);
break;
default:
// A normal space char.
count++;
}
}
return count;
}
boolean atStartOfInput() {
return super.getCharPositionInLine() == 0 && super.getLine() == 1;
}
and
String newLine = getText().replaceAll("[^\r\n\f]+", "");
String spaces = getText().replaceAll("[\r\n\f]+", "");
int next = _input.LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents,
// dedents and line breaks.
skip();
}
else {
emit(commonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
int previous = indents.isEmpty() ? 0 : indents.peek();
if (indent == previous) {
// skip indents of the same size as the present indent-size
skip();
}
else if (indent > previous) {
indents.push(indent);
emit(commonToken(Python3Parser.INDENT, spaces));
}
else {
// Possibly emit more than 1 DEDENT token.
while(!indents.isEmpty() && indents.peek() > indent) {
this.emit(createDedent());
indents.pop();
}
}
}
I translated these myself to:
# A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
tokens = deque()
# The stack that keeps track of the indentation level.
# https://docs.python.org/3/tutorial/datastructures.html#using-lists-as-stacks
indents = []
# The amount of opened braces, brackets and parenthesis.
opened = 0
# The most recently produced token.
lastToken = None
def emit(self, t):
self._token = t
self.tokens.append(t)
def nextToken(self):
# Check if the end-of-file is ahead and there are still some DEDENTS expected.
if self._input.LA(1) == Token.EOF and self.indents.size() != 0:
# Remove any trailing EOF tokens from our buffer.
for i in range(tokens.size() - 1, 0, -1):
if self.tokens[i].getType() == Token.EOF:
self.tokens.remove(i)
# First emit an extra line break that serves as the end of the statement.
self.emit(commonToken(Python3Parser.NEWLINE, "\n"))
# Now emit as much DEDENT tokens as needed.
while self.indents.size() != 0:
self.emit(createDedent())
self.indents.pop()
# Put the EOF back on the token stream.
self.emit(commonToken(Python3Parser.EOF, "<EOF>"))
next = self.nextToken()
if next.getChannel() == Token.DEFAULT_CHANNEL:
# Keep track of the last token on the default channel.
self.lastToken = next
return next if self.tokens.size() == 0 else self.tokens.popleft()
def createDedent():
dedent = commonToken(Python3Parser.DEDENT, "")
dedent.setLine(self.lastToken.getLine())
return dedent
def commonToken(self, type, text):
stop = self.getCharIndex() - 1
start = stop if text.size() == 0 else stop - text.size() + 1
return CommonToken(self._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop)
def getIndentationCount(spaces):
count = 0
for ch in spaces:
if ch == '\t':
count += 8 - (count % 8)
break
else:
# A normal space char.
count = count + 1
return count
def atStartOfInput(self):
return self.getCharPositionInLine() == 0 and self.getLine() == 1
and
newLine = getText().replaceAll("[^\r\n\f]+", "")
spaces = getText().replaceAll("[\r\n\f]+", "")
next = self._input.LA(1)
if opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
# If we're inside a list or on a blank line, ignore all indents,
# dedents and line breaks.
skip()
else:
emit(commonToken(NEWLINE, newLine))
indent = getIndentationCount(spaces)
previous = 0 if indents.isEmpty() else indents.peek()
if indent == previous:
# skip indents of the same size as the present indent-size
skip()
elif indent > previous:
indents.push(indent)
emit(commonToken(Python3Parser.INDENT, spaces))
else:
# Possibly emit more than 1 DEDENT token.
while not indents.isEmpty() and indents.peek() > indent:
self.emit(createDedent())
indents.pop()
and this is my python script to run the antlr output with the python inside instead of the java snippets. Ran with the command python main.py test.py
import sys
from antlr4 import *
from Python3Lexer import Python3Lexer
from Python3Parser import Python3Parser
from Python3Listener import Python3Listener
class FuncPrinter(Python3Listener):
def enterFuncdef(self, ctx):
print("Oh, a func")
def main(argv):
input = FileStream(argv[1])
lexer = Python3Lexer(input)
stream = CommonTokenStream(lexer)
parser = Python3Parser(stream)
tree = parser.funcdef()
printer = KeyPrinter()
walker = ParseTreeWalker()
walker.walk(printer, tree)
if __name__ == '__main__':
main(sys.argv)
It errors and print the following trace
Traceback (most recent call last):
File "main.py", line 24, in <module>
main(sys.argv)
File "main.py", line 17, in main
tree = parser.parameters()
File "...\antler-test\Python3Parser.py", line 1297, in parameters
self.enterRule(localctx, 14, self.RULE_parameters)
File "...\antler-test\antlr4\Parser.py", line 358, in enterRule
self._ctx.start = self._input.LT(1)
File "...\antler-test\antlr4\CommonTokenStream.py", line 61, in LT
self.lazyInit()
File "...\antler-test\antlr4\BufferedTokenStream.py", line 186, in lazyInit
self.setup()
File "...\antler-test\antlr4\BufferedTokenStream.py", line 189, in setup
self.sync(0)
File "...\antler-test\antlr4\BufferedTokenStream.py", line 111, in sync
fetched = self.fetch(n)
File "...\antler-test\antlr4\BufferedTokenStream.py", line 123, in fetch
t = self.tokenSource.nextToken()
File "...\antler-test\Python3Lexer.py", line 698, in nextToken
next = self.nextToken()
File "...\antler-test\Python3Lexer.py", line 698, in nextToken
next = self.nextToken()
File "...\antler-test\Python3Lexer.py", line 698, in nextToken
next = self.nextToken()
[Previous line repeated 985 more times]
File "...\antler-test\Python3Lexer.py", line 680, in nextToken
if self._input.LA(1) == Token.EOF and self.indents.size() != 0:
File "...\antler-test\antlr4\InputStream.py", line 49, in LA
if offset==0:
RecursionError: maximum recursion depth exceeded in comparison
the input file looks like:
def fun1():
return None
def fun2():
return None
I'm not sure if I translated the python incorrectly or the recursive algorithm is simply too much for python, but I also can't figure out how to change the algorithm for the nextToken method to be iterative since it is not tail recursive. Maybe someone could figure that out? Or is there some other problem with what I'm doing?