'''Tears apart a C file into a stream of tokens'''
import re

# Exception for a string-until-EOF
class StrError(Exception):
  '''Raised when a string or character literal is still open at end of input.'''

# Exception for a character that isn't supposed to be there.
class BadChar(Exception):
  '''Raised for an input character that no handler is registered for.

  The offending character is kept on the instance as ``char``.
  '''
  def __init__(self, char):
    self.char = char

  def __str__(self):
    # repr() is what the old backtick syntax called; unlike backticks it is
    # valid on both Python 2 and Python 3.  The repr makes control and
    # non-printable characters visible in the message.
    return 'Character %s' % repr(self.char)

# Exception for, well, end of file
class EOF(Exception):
  '''Raised when the input has no more characters to hand out.'''

# Token ids
class tokens:
  '''Numeric ids for every token kind the lexer can report.

  Ids 1-53 are punctuation (the trailing comment shows the lexeme), 100-103
  are tokens that carry text, and 200-201 are whitespace and newline.
  '''
  # Punctuation and operators
  NOT = 1       # !
  NEQ = 2       # !=
  AT = 3        # @
  HASH = 4      # #
  DEFCAT = 5    # ##
  DOLLAR = 6    # $
  MOD = 7       # %
  MODEQ = 8     # %=
  XOR = 9       # ^
  XOREQ = 10    # ^=
  AMP = 11      # &
  LAND = 12     # &&
  AMPEQ = 13    # &=
  AST = 14      # *
  MULEQ = 15    # *=
  LPAR = 16     # (
  RPAR = 17     # )
  MINUS = 18    # -
  DEC = 19      # --
  SUBEQ = 20    # -=
  IMEM = 21     # ->
  PLUS = 22     # +
  INC = 23      # ++
  ADDEQ = 24    # +=
  ASGN = 25     # =
  EQU = 26      # ==
  LBRC = 27     # {
  RBRC = 28     # }
  BOR = 29      # |
  LOR = 30      # ||
  OREQ = 31     # |=
  LBRA = 32     # [
  RBRA = 33     # ]
  SEMI = 34     # ;
  COLON = 35    # :
  SCOPE = 36    # ::
  COMMA = 37    # ,
  MEM = 38      # .
  LT = 39       # <
  LEQ = 40      # <=
  SHL = 41      # <<
  SHLEQ = 42    # <<=
  GT = 43       # >
  GEQ = 44      # >=
  SHR = 45      # >>
  SHREQ = 46    # >>=
  DIV = 47      # /
  BCOM = 48     # /*  (block comment opener)
  LCOM = 49     # //  (line comment opener)
  COND = 50     # ?
  BSLASH = 51   # backslash
  BTICK = 52    # `
  BNOT = 53     # ~

  # Tokens that carry text
  IDEN = 100    # identifier
  NUM = 101     # number
  CSTR = 102    # single-quoted literal
  SSTR = 103    # double-quoted literal

  # Layout
  WS = 200      # run of blanks, tabs and carriage returns
  NL = 201      # newline

# Symbol-to-tokenid mapping
symbols = {
  # Lexeme -> token id.  Multi-character operators are listed next to the
  # single character they start with.
  '!': tokens.NOT,
  '!=': tokens.NEQ,
  '@': tokens.AT,
  '#': tokens.HASH,
  '##': tokens.DEFCAT,
  '$': tokens.DOLLAR,
  '%': tokens.MOD,
  '%=': tokens.MODEQ,
  '^': tokens.XOR,
  '^=': tokens.XOREQ,
  '&': tokens.AMP,
  '&&': tokens.LAND,
  '&=': tokens.AMPEQ,
  '*': tokens.AST,
  '*=': tokens.MULEQ,
  '(': tokens.LPAR,
  ')': tokens.RPAR,
  '-': tokens.MINUS,
  '--': tokens.DEC,
  '-=': tokens.SUBEQ,
  '->': tokens.IMEM,
  '+': tokens.PLUS,
  '++': tokens.INC,
  '+=': tokens.ADDEQ,
  '=': tokens.ASGN,
  '==': tokens.EQU,
  '{': tokens.LBRC,
  '}': tokens.RBRC,
  '|': tokens.BOR,
  '||': tokens.LOR,
  '|=': tokens.OREQ,
  '[': tokens.LBRA,
  ']': tokens.RBRA,
  ';': tokens.SEMI,
  ':': tokens.COLON,
  '::': tokens.SCOPE,
  ',': tokens.COMMA,
  '.': tokens.MEM,
  '<': tokens.LT,
  '<=': tokens.LEQ,
  '<<': tokens.SHL,
  '<<=': tokens.SHLEQ,
  '>': tokens.GT,
  '>=': tokens.GEQ,
  '>>': tokens.SHR,
  '>>=': tokens.SHREQ,
  '/': tokens.DIV,
  '/*': tokens.BCOM,
  '//': tokens.LCOM,
  '?': tokens.COND,
  '\\': tokens.BSLASH,
  '`': tokens.BTICK,
  '~': tokens.BNOT,

  # Quote characters map to the id of the literal they open.
  '\'': tokens.CSTR,
  '"': tokens.SSTR,
}

# Comment tokens
comments = [
  tokens.BCOM,  # '/*'
  tokens.LCOM,  # '//'
]

# Handlers for each starting character
# Maps the first character of a token to the name of the method that lexes
# it.  Built from (characters, method name) groups so that each character
# class is spelled out once.
handlers = dict(
  (char, method)
  for chars, method in (
    ('\t\r ', 'HandleWhitespace'),
    ('\n', 'HandleNewline'),
    ('"\'', 'HandleString'),
    ('\\', 'HandleBackslash'),
    ('0123456789', 'HandleNumber'),
    ('ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz',
     'HandleIdentifier'),
    ('!#$%&()*+,-./:;<=>?@[]^`{|}~', 'HandleSymbol'),
  )
  for char in chars
)

# The lexer class
class Clex:
  '''Lexer that turns C source text into (tokenid, info) tuples.

  Characters are read from the file in chunks.  The first character of each
  token selects a Handle* method through the handlers table, and that method
  consumes the rest of the token.  Call gettoken() repeatedly; it raises EOF
  once the input is used up.
  '''

  # Number of characters requested from the file on every buffer refill.
  bufsize = 16384

  # The scanning handlers grow a token one character at a time and re-test
  # the whole text, so each pattern is anchored at both ends.  The end anchor
  # is \Z rather than $ because $ also matches just before a trailing
  # newline, which would pull that newline into the token.
  identre = re.compile(r'^[_a-z][_a-z0-9]*\Z', re.IGNORECASE)
  # NOTE(review): the text has to match after every added character, so an
  # exponent ("1e5") or a hex prefix ("0x1F") ends the number at the letter.
  numre = re.compile(r'^[0-9]+(\.?[0-9]*([e][0-9]+)?f?|l)?\Z', re.IGNORECASE)
  wsre = re.compile(r'^[ \r\t]+\Z')

  def __init__(self, file):
    '''Usage: Clex(file)

    file is the C file or file-like object that you want to lex.  It only
    needs a read(size) method that returns text.
    '''
    self.file = file
    self.tokens = tokens()
    self.symbols = symbols
    self.comments = comments
    self.handlers = handlers
    self.buffer = list(self.file.read(self.bufsize))
    self.ahead = 0

  def getc(self):
    '''Internal function: Gets a character from the internal buffer.

    Refills the buffer from the file when it is used up and raises EOF when
    the file has nothing more to give.
    '''
    if self.ahead == len(self.buffer):
      self.buffer = list(self.file.read(self.bufsize))
      self.ahead = 0
      if not self.buffer:
        raise EOF
    c = self.buffer[self.ahead]
    self.ahead = self.ahead + 1
    return c

  def ungetc(self, char):
    '''Internal function: Restores a character to the internal buffer.'''
    if self.ahead == 0:
      # Nothing to step back over (the buffer was just refilled), so the
      # character is put in front of it instead.
      self.buffer = [char] + self.buffer
    else:
      self.ahead = self.ahead - 1
      if isinstance(char, str):
        self.buffer[self.ahead] = char[0]

  def _scan(self, char, pattern, token):
    '''Internal function: Extends char for as long as the text matches.

    Reads characters until the accumulated text stops matching pattern, puts
    the character that broke the match back, and returns (token, text).
    '''
    s = char
    while pattern.match(s):
      try:
        c = self.getc()
      except EOF:
        return (token, s)
      s = s + c
    self.ungetc(c)
    return (token, s[:-1])

  def HandleBackslash(self, char):
    '''Internal function: Handles cases where a backslash is seen.

    A backslash directly followed by a newline (optionally with a carriage
    return in between) is a line continuation and produces no token, so None
    is returned.  Otherwise the result is (BSLASH id, None).
    '''
    try:
      c = self.getc()
      if c == '\r':
        # The carriage return is not restored if no newline follows it.
        c = self.getc()
    except EOF:
      # Always hand back a tuple: gettoken() indexes the result.
      return (self.symbols['\\'], None)
    if c == '\n':
      return None
    self.ungetc(c)
    return (self.symbols['\\'], None)

  def HandleSymbol(self, char):
    '''Internal function: Handles cases where a symbol is seen.

    Takes the longest run of characters that is a key of the symbol table and
    returns (tokenid, text).
    '''
    s = char
    while s in self.symbols:
      try:
        c = self.getc()
      except EOF:
        return (self.symbols[s], s)
      s = s + c
    self.ungetc(c)
    return (self.symbols[s[:-1]], s[:-1])

  def HandleIdentifier(self, char):
    '''Internal function: Handles cases where a letter is seen'''
    return self._scan(char, self.identre, self.tokens.IDEN)

  def HandleNumber(self, char):
    '''Internal function: Handles cases where a numeral is seen'''
    return self._scan(char, self.numre, self.tokens.NUM)

  def HandleWhitespace(self, char):
    '''Internal function: Handles cases where whitespace is seen.

    Returns (WS id, text), also when the run is cut short by end of input.
    '''
    return self._scan(char, self.wsre, self.tokens.WS)

  def HandleNewline(self, char):
    '''Internal function: Handles cases where a newline is seen.

    A run of consecutive newlines collapses into one (NL id, None) token.
    '''
    try:
      c = self.getc()
      while c == char:
        c = self.getc()
    except EOF:
      return (self.tokens.NL, None)
    self.ungetc(c)
    return (self.tokens.NL, None)

  def HandleBadChar(self, char):
    '''Internal function: Handles cases where an invalid character is seen'''
    raise BadChar(char)

  def HandleComments(self, token):
    '''Internal function: Handles cases where a comment is seen.

    token is the id of the comment opener that was just read.  A line comment
    is skipped up to and including the next newline, a block comment up to and
    including the closing star-slash.  Running into end of input simply ends
    the comment.
    '''
    try:
      if token == self.tokens.LCOM:
        while self.getc() != '\n':
          pass
      elif token == self.tokens.BCOM:
        # Only the previous character matters for spotting the terminator.
        prev = ''
        while True:
          c = self.getc()
          if prev == '*' and c == '/':
            return
          prev = c
    except EOF:
      return

  def HandleString(self, char):
    '''Internal function: Handles cases where a string is seen.

    char is the opening quote.  Returns (tokenid, text) where text is what
    stands between the quotes, with escape sequences left as written.  A
    backslash escapes exactly the one character after it, so an escaped
    backslash does not escape the closing quote.

    Raises StrError if the input ends before the closing quote.
    '''
    t = self.symbols[char]
    chars = []
    escaped = False
    try:
      while True:
        c = self.getc()
        if escaped:
          escaped = False
        elif c == '\\':
          escaped = True
        elif c == char:
          return (t, ''.join(chars))
        chars.append(c)
    except EOF:
      raise StrError("String runs until EOF")

  def gettoken(self):
    '''Public function: returns a lexed token from the C file
The token has the following format:

  (<tokenid>, <info>)

For numbers, identifiers, whitespace and symbols, info is the text of the
token; for strings it is the text between the quotes. For newlines and for a
lone backslash it is None. Comments and line continuations are skipped.

Raises EOF when the input is used up, BadChar for a character that has no
handler, and StrError for a string that is never closed.'''
    while True:
      s = self.getc()
      name = self.handlers.get(s)
      if name is None:
        self.HandleBadChar(s)
        continue
      r = getattr(self, name)(s)
      if not r:
        # Line continuation: nothing to report.
        continue
      if r[0] in self.comments:
        self.HandleComments(r[0])
        continue
      return r

if __name__=='__main__':
  # Self-test: lex a small embedded C snippet and print every token.
  try:
    from StringIO import StringIO  # Python 2
  except ImportError:
    from io import StringIO  # Python 3

  a='''int main(void)
{
  int a=0;  // Line Comment
  char *c="123"; /* Block Comment */
};
'''

  b=Clex(StringIO(a))
  while 1:
    # gettoken() signals the end of the input by raising EOF; stop cleanly
    # instead of ending the demo with a traceback.
    try:
      tok=b.gettoken()
    except EOF:
      break
    # With a single argument this prints the same on Python 2 and Python 3.
    print(tok)
