4

I used ANTLR4 to generate a Python target using the Python3.g4 grammar file from the ANTLR grammars repo. The generated Python3Lexer.py file contained Java code which I needed to translate to Python. Here are the two Java segments it output; you can also find them both inside the Python3 grammar file.

// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>();

// The stack that keeps track of the indentation level.
private java.util.Stack<Integer> indents = new java.util.Stack<>();

// The amount of opened braces, brackets and parenthesis.
private int opened = 0;

// The most recently produced token.
private Token lastToken = null;

@Override
public void emit(Token t) {
    super.setToken(t);
    tokens.offer(t);
}

@Override
public Token nextToken() {

    // Check if the end-of-file is ahead and there are still some DEDENTS expected.
    if (_input.LA(1) == EOF && !this.indents.isEmpty()) {

    // Remove any trailing EOF tokens from our buffer.
    for (int i = tokens.size() - 1; i >= 0; i--) {
        if (tokens.get(i).getType() == EOF) {
          tokens.remove(i);
        }
    }

    // First emit an extra line break that serves as the end of the statement.
    this.emit(commonToken(Python3Parser.NEWLINE, "\n"));

    // Now emit as much DEDENT tokens as needed.
    while (!indents.isEmpty()) {
        this.emit(createDedent());
        indents.pop();
    }

    // Put the EOF back on the token stream.
    this.emit(commonToken(Python3Parser.EOF, "<EOF>"));
  }

  Token next = super.nextToken();

  if (next.getChannel() == Token.DEFAULT_CHANNEL) {
      // Keep track of the last token on the default channel.
      this.lastToken = next;
  }

  return tokens.isEmpty() ? next : tokens.poll();
}

private Token createDedent() {
    CommonToken dedent = commonToken(Python3Parser.DEDENT, "");
    dedent.setLine(this.lastToken.getLine());
    return dedent;
}

private CommonToken commonToken(int type, String text) {
    int stop = this.getCharIndex() - 1;
    int start = text.isEmpty() ? stop : stop - text.length() + 1;
    return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop);
}

static int getIndentationCount(String spaces) {

    int count = 0;

    for (char ch : spaces.toCharArray()) {
      switch (ch) {
        case '\t':
          count += 8 - (count % 8);
          break;
        default:
          // A normal space char.
          count++;
      }
    }

    return count;
}

boolean atStartOfInput() {
    return super.getCharPositionInLine() == 0 && super.getLine() == 1;
}

and

String newLine = getText().replaceAll("[^\r\n\f]+", "");
String spaces = getText().replaceAll("[\r\n\f]+", "");
int next = _input.LA(1);

if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
   // If we're inside a list or on a blank line, ignore all indents,
   // dedents and line breaks.
   skip();
}
else {
   emit(commonToken(NEWLINE, newLine));

   int indent = getIndentationCount(spaces);
   int previous = indents.isEmpty() ? 0 : indents.peek();

   if (indent == previous) {
       // skip indents of the same size as the present indent-size
       skip();
   }
   else if (indent > previous) {
       indents.push(indent);
       emit(commonToken(Python3Parser.INDENT, spaces));
   }
   else {
       // Possibly emit more than 1 DEDENT token.
       while(!indents.isEmpty() && indents.peek() > indent) {
           this.emit(createDedent());
           indents.pop();
       }
   }
}

I translated these myself to:

# A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
tokens = deque()

# The stack that keeps track of the indentation level.
# https://docs.python.org/3/tutorial/datastructures.html#using-lists-as-stacks
indents = []

# The amount of opened braces, brackets and parenthesis.
opened = 0

# The most recently produced token.
lastToken = None

def emit(self, t):
  self._token = t
  self.tokens.append(t)

def nextToken(self):

  # Check if the end-of-file is ahead and there are still some DEDENTS expected.
  if self._input.LA(1) == Token.EOF and self.indents.size() != 0:

    # Remove any trailing EOF tokens from our buffer.
    for i in range(tokens.size() - 1, 0, -1):
      if self.tokens[i].getType() == Token.EOF:
        self.tokens.remove(i)

    # First emit an extra line break that serves as the end of the statement.
    self.emit(commonToken(Python3Parser.NEWLINE, "\n"))

    # Now emit as much DEDENT tokens as needed.
    while self.indents.size() != 0:
      self.emit(createDedent())
      self.indents.pop()

    # Put the EOF back on the token stream.
    self.emit(commonToken(Python3Parser.EOF, "<EOF>"))

  next = self.nextToken()

  if next.getChannel() == Token.DEFAULT_CHANNEL:
    # Keep track of the last token on the default channel.
    self.lastToken = next

  return next if self.tokens.size() == 0 else self.tokens.popleft()

def createDedent():
  dedent = commonToken(Python3Parser.DEDENT, "")
  dedent.setLine(self.lastToken.getLine())
  return dedent

def commonToken(self, type, text):
  stop = self.getCharIndex() - 1
  start = stop if text.size() == 0 else stop - text.size() + 1
  return CommonToken(self._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop)

def getIndentationCount(spaces):

  count = 0

  for ch in spaces:
    if ch == '\t':
        count += 8 - (count % 8)
        break
    else:
        # A normal space char.
        count = count + 1

  return count

def atStartOfInput(self):
  return self.getCharPositionInLine() == 0 and self.getLine() == 1

and

newLine = getText().replaceAll("[^\r\n\f]+", "")
spaces = getText().replaceAll("[\r\n\f]+", "")
next = self._input.LA(1)

if opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
    # If we're inside a list or on a blank line, ignore all indents,
    # dedents and line breaks.
    skip()
else:
    emit(commonToken(NEWLINE, newLine))

indent = getIndentationCount(spaces)
previous = 0 if indents.isEmpty() else indents.peek()

if indent == previous:
    # skip indents of the same size as the present indent-size
    skip()
elif indent > previous:
    indents.push(indent)
    emit(commonToken(Python3Parser.INDENT, spaces))
else:
     # Possibly emit more than 1 DEDENT token.
     while not indents.isEmpty() and indents.peek() > indent:
     self.emit(createDedent())
     indents.pop()

and this is my Python script that runs the ANTLR output with my Python translations in place of the Java snippets. It is run with the command python main.py test.py

import sys
from antlr4 import *
from Python3Lexer import Python3Lexer
from Python3Parser import Python3Parser
from Python3Listener import Python3Listener

class FuncPrinter(Python3Listener):
  def enterFuncdef(self, ctx):
    print("Oh, a func")

def main(argv):
  input = FileStream(argv[1])
  lexer = Python3Lexer(input)
  stream = CommonTokenStream(lexer)
  parser = Python3Parser(stream)
  tree = parser.funcdef()

  printer = KeyPrinter()
  walker = ParseTreeWalker()
  walker.walk(printer, tree)

if __name__ == '__main__':
  main(sys.argv)

It errors and prints the following trace:

Traceback (most recent call last):
  File "main.py", line 24, in <module>
    main(sys.argv)
  File "main.py", line 17, in main
    tree = parser.parameters()
  File "...\antler-test\Python3Parser.py", line 1297, in parameters
    self.enterRule(localctx, 14, self.RULE_parameters)
  File "...\antler-test\antlr4\Parser.py", line 358, in enterRule
    self._ctx.start = self._input.LT(1)
  File "...\antler-test\antlr4\CommonTokenStream.py", line 61, in LT
    self.lazyInit()
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 186, in lazyInit
    self.setup()
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 189, in setup
    self.sync(0)
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 111, in sync
    fetched = self.fetch(n)
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 123, in fetch
    t = self.tokenSource.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  [Previous line repeated 985 more times]
  File "...\antler-test\Python3Lexer.py", line 680, in nextToken
    if self._input.LA(1) == Token.EOF and self.indents.size() != 0:
  File "...\antler-test\antlr4\InputStream.py", line 49, in LA
    if offset==0:
RecursionError: maximum recursion depth exceeded in comparison

the input file looks like:

def fun1():
    return None

def fun2():
    return None

I'm not sure whether I translated the Java to Python incorrectly, or whether the recursive algorithm is simply too deep for Python. I also can't figure out how to rewrite the nextToken method iteratively, since it is not tail recursive. Maybe someone could figure that out? Or is there some other problem with what I'm doing?

0

3 Answers 3

3

I've been working on exactly the same topic for a couple of days.

This is not so easy. The Python runtime does not have exactly the same API as the Java one. The Python runtime is less used and rather incomplete.

I had to use some workarounds, but it seems to work. Here is my code :

tokens { INDENT, DEDENT }

@lexer::members {

    # A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
    self.tokens = []

    # The stack that keeps track of the indentation level.
    self.indents = []

    # The amount of opened braces, brackets and parenthesis.
    self.opened = 0

    # The most recently produced token.
    self.last_token = None

def emitToken(self, t):
    super().emitToken(t)
    self.tokens.append(t)

def nextToken(self):
    if self._input.LA(1) == Token.EOF and len(self.indents) > 0:
        # Remove any trailing EOF tokens from our buffer.
        while len(self.tokens) > 0 and self.tokens[-1].type == Token.EOF:
            del self.tokens[-1]

        # First emit an extra line break that serves as the end of the statement.
        self.emitToken(self.common_token(Python3Lexer.NEWLINE, "\n"));

        # Now emit as much DEDENT tokens as needed.
        while len(self.indents) != 0:
            self.emitToken(self.create_dedent())
            del self.indents[-1]

        # Put the EOF back on the token stream.
        self.emitToken(self.common_token(Token.EOF, "<EOF>"));

    next = super().nextToken();

    if next.channel == Token.DEFAULT_CHANNEL:
        # Keep track of the last token on the default channel.
        self.last_token = next

    if len(self.tokens) == 0:
        return next
    else:
        t = self.tokens[0]
        del self.tokens[0]
        return t

def create_dedent(self):
    from Python3Parser import Python3Parser
    dedent = self.common_token(Python3Parser.DEDENT, "")
    dedent.line = self.last_token.line
    return dedent

def common_token(self, _type,  text):
    from antlr4.Token import CommonToken
    stop = self.getCharIndex() - 1
    if len(self.text) == 0:
        start = stop
    else:
        start = stop - len(self.text) + 1
    return CommonToken(self._tokenFactorySourcePair, _type, Lexer.DEFAULT_TOKEN_CHANNEL, start, stop)

## Calculates the indentation of the provided spaces, taking the
## following rules into account:
##
## "Tabs are replaced (from left to right) by one to eight spaces
##  such that the total number of characters up to and including
##  the replacement is a multiple of eight [...]"
##
##  -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
def getIndentationCount(self, spaces):
    count = 0
    for ch in spaces:
        if ch == '\t':
            count += 8 - (count % 8)
        else:
            count += 1
    return count

def atStartOfInput(self):
    return self._interp.column == 0 and self._interp.line == 1

}

And for the NEWLINE lexer part :

NEWLINE
 : ( {self.atStartOfInput()}?   SPACES
   | ( '\r'? '\n' | '\r' | '\f' ) SPACES?
   )

   {
    import re
    from Python3Parser import Python3Parser
    new_line = re.sub(r"[^\r\n\f]+", "", self._interp.getText(self._input)) #.replaceAll("[^\r\n\f]+", "")
    spaces = re.sub(r"[\r\n\f]+", "", self._interp.getText(self._input)) #.replaceAll("[\r\n\f]+", "")
    next = self._input.LA(1)

    if self.opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
        self.skip()
    else:
        self.emitToken(self.common_token(self.NEWLINE, new_line))

        indent = self.getIndentationCount(spaces)
        if len(self.indents) == 0:
            previous = 0
        else:
            previous = self.indents[-1]

        if indent == previous:
            self.skip()
        elif indent > previous:
            self.indents.append(indent)
            self.emitToken(self.common_token(Python3Parser.INDENT, spaces))
        else:
            while len(self.indents) > 0 and self.indents[-1] > indent:
                self.emitToken(self.create_dedent())
                del self.indents[-1]

   };

You also have to replace the lexer id "str" with something else (for instance "string") throughout the whole file, because str is a built-in type name in Python.

Sign up to request clarification or add additional context in comments.

Comments

2

Your python code says

next = self.nextToken()

But your java code says:

Token next = super.nextToken();

Note that super is not the same as self. You probably mean something like:

next = super().nextToken()

Comments

1

I had the same problem. I couldn't quite get Alexandre's code working under python3. I had to modify it a bit:

...
next = self._input.LA(1)
if next == Python3Parser.EOF:
    chr_next = -1
else:
    chr_next = chr( next )

if self.opened > 0 or chr_next == '\r' or chr_next == '\n' or chr_next == '\f' or chr_next == '#':
    self.skip()
...

You can also move all of the imports into the lexer's header:

@lexer::header {
import re
from Python3Parser import Python3Parser
from antlr4.Token import CommonToken    
}

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.