2017-04-25 69 views
2

我使用antlr4從antlr語法回購使用Python3.g4語法文件生成python目標。生成的Python3Lexer.py文件包含我需要轉換爲python的Java代碼。下面是它輸出的兩個Java段,你可以找到他們倆的python3語法文件中here also將java翻譯爲antlr4中的python python3目標

// A queue where extra tokens are pushed on (see the NEWLINE lexer rule). 
private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>(); 

// The stack that keeps track of the indentation level. 
private java.util.Stack<Integer> indents = new java.util.Stack<>(); 

// The amount of opened braces, brackets and parenthesis. 
private int opened = 0; 

// The most recently produced token. 
private Token lastToken = null; 

@Override 
public void emit(Token t) { 
    super.setToken(t); 
    tokens.offer(t); 
} 

@Override 
public Token nextToken() { 

    // Check if the end-of-file is ahead and there are still some DEDENTS expected. 
    if (_input.LA(1) == EOF && !this.indents.isEmpty()) { 

    // Remove any trailing EOF tokens from our buffer. 
    for (int i = tokens.size() - 1; i >= 0; i--) { 
     if (tokens.get(i).getType() == EOF) { 
      tokens.remove(i); 
     } 
    } 

    // First emit an extra line break that serves as the end of the statement. 
    this.emit(commonToken(Python3Parser.NEWLINE, "\n")); 

    // Now emit as much DEDENT tokens as needed. 
    while (!indents.isEmpty()) { 
     this.emit(createDedent()); 
     indents.pop(); 
    } 

    // Put the EOF back on the token stream. 
    this.emit(commonToken(Python3Parser.EOF, "<EOF>")); 
    } 

    Token next = super.nextToken(); 

    if (next.getChannel() == Token.DEFAULT_CHANNEL) { 
     // Keep track of the last token on the default channel. 
     this.lastToken = next; 
    } 

    return tokens.isEmpty() ? next : tokens.poll(); 
} 

private Token createDedent() { 
    CommonToken dedent = commonToken(Python3Parser.DEDENT, ""); 
    dedent.setLine(this.lastToken.getLine()); 
    return dedent; 
} 

private CommonToken commonToken(int type, String text) { 
    int stop = this.getCharIndex() - 1; 
    int start = text.isEmpty() ? stop : stop - text.length() + 1; 
    return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop); 
} 

static int getIndentationCount(String spaces) { 

    int count = 0; 

    for (char ch : spaces.toCharArray()) { 
     switch (ch) { 
     case '\t': 
      count += 8 - (count % 8); 
      break; 
     default: 
      // A normal space char. 
      count++; 
     } 
    } 

    return count; 
} 

boolean atStartOfInput() { 
    return super.getCharPositionInLine() == 0 && super.getLine() == 1; 
} 

String newLine = getText().replaceAll("[^\r\n\f]+", ""); 
String spaces = getText().replaceAll("[\r\n\f]+", ""); 
int next = _input.LA(1); 

if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') { 
    // If we're inside a list or on a blank line, ignore all indents, 
    // dedents and line breaks. 
    skip(); 
} 
else { 
    emit(commonToken(NEWLINE, newLine)); 

    int indent = getIndentationCount(spaces); 
    int previous = indents.isEmpty() ? 0 : indents.peek(); 

    if (indent == previous) { 
     // skip indents of the same size as the present indent-size 
     skip(); 
    } 
    else if (indent > previous) { 
     indents.push(indent); 
     emit(commonToken(Python3Parser.INDENT, spaces)); 
    } 
    else { 
     // Possibly emit more than 1 DEDENT token. 
     while(!indents.isEmpty() && indents.peek() > indent) { 
      this.emit(createDedent()); 
      indents.pop(); 
     } 
    } 
} 

我翻譯這些自己:

# A queue where extra tokens are pushed on (see the NEWLINE lexer rule). 
tokens = deque() 

# The stack that keeps track of the indentation level. 
# https://docs.python.org/3/tutorial/datastructures.html#using-lists-as-stacks 
indents = [] 

# The amount of opened braces, brackets and parenthesis. 
opened = 0 

# The most recently produced token. 
lastToken = None 

def emit(self, t): 
    self._token = t 
    self.tokens.append(t) 

def nextToken(self): 

    # Check if the end-of-file is ahead and there are still some DEDENTS expected. 
    if self._input.LA(1) == Token.EOF and self.indents.size() != 0: 

    # Remove any trailing EOF tokens from our buffer. 
    for i in range(tokens.size() - 1, 0, -1): 
     if self.tokens[i].getType() == Token.EOF: 
     self.tokens.remove(i) 

    # First emit an extra line break that serves as the end of the statement. 
    self.emit(commonToken(Python3Parser.NEWLINE, "\n")) 

    # Now emit as much DEDENT tokens as needed. 
    while self.indents.size() != 0: 
     self.emit(createDedent()) 
     self.indents.pop() 

    # Put the EOF back on the token stream. 
    self.emit(commonToken(Python3Parser.EOF, "<EOF>")) 

    next = self.nextToken() 

    if next.getChannel() == Token.DEFAULT_CHANNEL: 
    # Keep track of the last token on the default channel. 
    self.lastToken = next 

    return next if self.tokens.size() == 0 else self.tokens.popleft() 

def createDedent(): 
    dedent = commonToken(Python3Parser.DEDENT, "") 
    dedent.setLine(self.lastToken.getLine()) 
    return dedent 

def commonToken(self, type, text): 
    stop = self.getCharIndex() - 1 
    start = stop if text.size() == 0 else stop - text.size() + 1 
    return CommonToken(self._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop) 

def getIndentationCount(spaces): 

    count = 0 

    for ch in spaces: 
    if ch == '\t': 
     count += 8 - (count % 8) 
     break 
    else: 
     # A normal space char. 
     count = count + 1 

    return count 

def atStartOfInput(self): 
    return self.getCharPositionInLine() == 0 and self.getLine() == 1 

newLine = getText().replaceAll("[^\r\n\f]+", "") 
spaces = getText().replaceAll("[\r\n\f]+", "") 
next = self._input.LA(1) 

if opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#': 
    # If we're inside a list or on a blank line, ignore all indents, 
    # dedents and line breaks. 
    skip() 
else: 
    emit(commonToken(NEWLINE, newLine)) 

indent = getIndentationCount(spaces) 
previous = 0 if indents.isEmpty() else indents.peek() 

if indent == previous: 
    # skip indents of the same size as the present indent-size 
    skip() 
elif indent > previous: 
    indents.push(indent) 
    emit(commonToken(Python3Parser.INDENT, spaces)) 
else: 
    # Possibly emit more than 1 DEDENT token. 
    while not indents.isEmpty() and indents.peek() > indent: 
    self.emit(createDedent()) 
    indents.pop() 

,這是我的python腳本,用python代替java代碼片段來運行antlr輸出。用命令冉python main.py test.py

import sys 
from antlr4 import * 
from Python3Lexer import Python3Lexer 
from Python3Parser import Python3Parser 
from Python3Listener import Python3Listener 

class FuncPrinter(Python3Listener): 
    def enterFuncdef(self, ctx): 
    print("Oh, a func") 

def main(argv): 
    input = FileStream(argv[1]) 
    lexer = Python3Lexer(input) 
    stream = CommonTokenStream(lexer) 
    parser = Python3Parser(stream) 
    tree = parser.funcdef() 

    printer = KeyPrinter() 
    walker = ParseTreeWalker() 
    walker.walk(printer, tree) 

if __name__ == '__main__': 
    main(sys.argv) 

它的錯誤,並打印以下的跟蹤

Traceback (most recent call last): 
    File "main.py", line 24, in <module> 
    main(sys.argv) 
    File "main.py", line 17, in main 
    tree = parser.parameters() 
    File "...\antler-test\Python3Parser.py", line 1297, in parameters 
    self.enterRule(localctx, 14, self.RULE_parameters) 
    File "...\antler-test\antlr4\Parser.py", line 358, in enterRule 
    self._ctx.start = self._input.LT(1) 
    File "...\antler-test\antlr4\CommonTokenStream.py", line 61, in LT 
    self.lazyInit() 
    File "...\antler-test\antlr4\BufferedTokenStream.py", line 186, in lazyInit 
    self.setup() 
    File "...\antler-test\antlr4\BufferedTokenStream.py", line 189, in setup 
    self.sync(0) 
    File "...\antler-test\antlr4\BufferedTokenStream.py", line 111, in sync 
    fetched = self.fetch(n) 
    File "...\antler-test\antlr4\BufferedTokenStream.py", line 123, in fetch 
    t = self.tokenSource.nextToken() 
    File "...\antler-test\Python3Lexer.py", line 698, in nextToken 
    next = self.nextToken() 
    File "...\antler-test\Python3Lexer.py", line 698, in nextToken 
    next = self.nextToken() 
    File "...\antler-test\Python3Lexer.py", line 698, in nextToken 
    next = self.nextToken() 
    [Previous line repeated 985 more times] 
    File "...\antler-test\Python3Lexer.py", line 680, in nextToken 
    if self._input.LA(1) == Token.EOF and self.indents.size() != 0: 
    File "...\antler-test\antlr4\InputStream.py", line 49, in LA 
    if offset==0: 
RecursionError: maximum recursion depth exceeded in comparison 

輸入文件看起來像:

def fun1(): 
    return None 

def fun2(): 
    return None 

我不知道如果我翻譯的錯誤蟒蛇或者遞歸算法對python來說太簡單了,但我也無法弄清楚如何改變nextToken方法的迭代算法,因爲它不是尾遞歸即也許有人可以弄清楚?或者我在做什麼還有其他問題?

回答

2

我在幾天內就完全從事同一主題。

這並不容易。 Python運行時並不完全相同的API。 Python運行時不太常用,也不完整。我不得不使用一些變通辦法,但它似乎工作。這裏是我的代碼:

tokens { INDENT, DEDENT } 

@lexer::members { 

    # A queue where extra tokens are pushed on (see the NEWLINE lexer rule). 
    self.tokens = [] 

    # The stack that keeps track of the indentation level. 
    self.indents = [] 

    # The amount of opened braces, brackets and parenthesis. 
    self.opened = 0 

    # The most recently produced token. 
    self.last_token = None 

def emitToken(self, t): 
    super().emitToken(t) 
    self.tokens.append(t) 

def nextToken(self): 
    if self._input.LA(1) == Token.EOF and len(self.indents) > 0: 
     # Remove any trailing EOF tokens from our buffer. 
     while len(self.tokens) > 0 and self.tokens[-1].type == Token.EOF: 
      del self.tokens[-1] 

     # First emit an extra line break that serves as the end of the statement. 
     self.emitToken(self.common_token(Python3Lexer.NEWLINE, "\n")); 

     # Now emit as much DEDENT tokens as needed. 
     while len(self.indents) != 0: 
      self.emitToken(self.create_dedent()) 
      del self.indents[-1] 

     # Put the EOF back on the token stream. 
     self.emitToken(self.common_token(Token.EOF, "<EOF>")); 

    next = super().nextToken(); 

    if next.channel == Token.DEFAULT_CHANNEL: 
     # Keep track of the last token on the default channel. 
     self.last_token = next 

    if len(self.tokens) == 0: 
     return next 
    else: 
     t = self.tokens[0] 
     del self.tokens[0] 
     return t 

def create_dedent(self): 
    from Python3Parser import Python3Parser 
    dedent = self.common_token(Python3Parser.DEDENT, "") 
    dedent.line = self.last_token.line 
    return dedent 

def common_token(self, _type, text): 
    from antlr4.Token import CommonToken 
    stop = self.getCharIndex() - 1 
    if len(self.text) == 0: 
     start = stop 
    else: 
     start = stop - len(self.text) + 1 
    return CommonToken(self._tokenFactorySourcePair, _type, Lexer.DEFAULT_TOKEN_CHANNEL, start, stop) 

## Calculates the indentation of the provided spaces, taking the 
## following rules into account: 
## 
## "Tabs are replaced (from left to right) by one to eight spaces 
## such that the total number of characters up to and including 
## the replacement is a multiple of eight [...]" 
## 
## -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation 
def getIndentationCount(self, spaces): 
    count = 0 
    for ch in spaces: 
     if ch == '\t': 
      count += 8 - (count % 8) 
     else: 
      count += 1 
    return count 

def atStartOfInput(self): 
    return self._interp.column == 0 and self._interp.line == 1 

} 

而對於NEWLINE詞法部分:

NEWLINE 
: ({self.atStartOfInput()}? SPACES 
    | ('\r'? '\n' | '\r' | '\f') SPACES? 
    ) 

    { 
    import re 
    from Python3Parser import Python3Parser 
    new_line = re.sub(r"[^\r\n\f]+", "", self._interp.getText(self._input)) #.replaceAll("[^\r\n\f]+", "") 
    spaces = re.sub(r"[\r\n\f]+", "", self._interp.getText(self._input)) #.replaceAll("[\r\n\f]+", "") 
    next = self._input.LA(1) 

    if self.opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#': 
     self.skip() 
    else: 
     self.emitToken(self.common_token(self.NEWLINE, new_line)) 

     indent = self.getIndentationCount(spaces) 
     if len(self.indents) == 0: 
      previous = 0 
     else: 
      previous = self.indents[-1] 

     if indent == previous: 
      self.skip() 
     elif indent > previous: 
      self.indents.append(indent) 
      self.emitToken(self.common_token(Python3Parser.INDENT, spaces)) 
     else: 
      while len(self.indents) > 0 and self.indents[-1] > indent: 
       self.emitToken(self.create_dedent()) 
       del self.indents[-1] 

    }; 

您還可以通過「串」整個文件中的詞法分析器ID「海峽」,以取代(例如),因爲str是python中的關鍵字。

2

Python代碼說

next = self.nextToken() 

但你的Java代碼說:

Token next = super.nextToken(); 

注意super是不一樣的self.你大概的意思是這樣的:

next = super().nextToken()