#
# Copyright (C) 2009-2020 the sqlparse authors and contributors
# <see AUTHORS file>
#
# This module is part of python-sqlparse and is released under
# the BSD License: https://opensource.org/licenses/BSD-3-Clause

"""SQL Lexer"""
import re

# This code is based on the SqlLexer in pygments.
# http://pygments.org/
# It's separated from the rest of pygments to increase performance
# and to allow some customizations.

from io import TextIOBase

from sqlparse import tokens, keywords
from sqlparse.utils import consume


class Lexer:
    """The Lexer supports configurable syntax.

    To add support for additional keywords, use the `add_keywords` method.
    """

    _default_instance = None

    # Development notes:
    # - This class is prepared to support additional SQL dialects in the
    #   future by adding functions that take the place of
    #   default_initialization().
    # - The lexer class uses explicit singleton behavior with the
    #   instance-getter method get_default_instance(). This mechanism has
    #   the advantage that the call signatures of the entry points to the
    #   sqlparse library are not affected. Also, usage of sqlparse in third
    #   party code does not need to be adapted. On the other hand, singleton
    #   behavior is not thread safe, and the current implementation does not
    #   easily allow for multiple SQL dialects to be parsed in the same
    #   process. Such behavior can be supported in the future by passing a
    #   suitably initialized lexer object as an additional parameter to the
    #   entry-point functions (such as `parse`). Code will need to be written
    #   to pass down and utilize such an object. The current implementation
    #   is prepared to support this thread-safe approach without the
    #   default_instance part needing to change its interface.

    @classmethod
    def get_default_instance(cls):
        """Return the lexer instance used internally by the sqlparse
        core functions."""
        if cls._default_instance is None:
            cls._default_instance = cls()
            cls._default_instance.default_initialization()
        return cls._default_instance
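
    # Usage sketch (illustrative comment, not executed as part of this
    # module): repeated calls return the same shared instance, so
    # customizations applied through it affect every later call into
    # sqlparse in the same process.
    #
    #     lexer = Lexer.get_default_instance()
    #     assert lexer is Lexer.get_default_instance()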

    def default_initialization(self):
        """Initialize the lexer with default dictionaries.

        Useful if you need to revert custom syntax settings.
        """
        self.clear()

        self.set_SQL_REGEX(keywords.SQL_REGEX)
        self.add_keywords(keywords.KEYWORDS_COMMON)
        self.add_keywords(keywords.KEYWORDS_ORACLE)
        self.add_keywords(keywords.KEYWORDS_PLPGSQL)
        self.add_keywords(keywords.KEYWORDS_HQL)
        self.add_keywords(keywords.KEYWORDS_MSACCESS)
        self.add_keywords(keywords.KEYWORDS)

    def clear(self):
        """Clear all syntax configurations.

        Useful if you want to load a reduced set of syntax configurations.
        After this call, regexps and keyword dictionaries need to be loaded
        to make the lexer functional again.
        """
        self._SQL_REGEX = []
        self._keywords = []

    def set_SQL_REGEX(self, SQL_REGEX):
        """Set the list of regular expressions used to tokenize the SQL."""
        FLAGS = re.IGNORECASE | re.UNICODE
        self._SQL_REGEX = [
            (re.compile(rx, FLAGS).match, tt)
            for rx, tt in SQL_REGEX
        ]

    def add_keywords(self, keywords):
        """Add keyword dictionaries. Keywords are looked up in the same order
        that dictionaries were added."""
        self._keywords.append(keywords)
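
    # Customization sketch (illustrative comment): a reduced or extended
    # syntax can be loaded by combining clear(), set_SQL_REGEX() and
    # add_keywords() on the default instance. The keyword name below is a
    # made-up example, not part of the shipped dictionaries.
    #
    #     lexer = Lexer.get_default_instance()
    #     lexer.clear()
    #     lexer.set_SQL_REGEX(keywords.SQL_REGEX)
    #     lexer.add_keywords(keywords.KEYWORDS_COMMON)
    #     lexer.add_keywords({'MYCUSTOMKEYWORD': tokens.Keyword})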

    def is_keyword(self, value):
        """Check for a keyword.

        If the given value is in one of the KEYWORDS_* dictionaries it's
        considered a keyword. Otherwise, ``tokens.Name`` is returned.
        """
        val = value.upper()
        for kwdict in self._keywords:
            if val in kwdict:
                return kwdict[val], value
        return tokens.Name, value
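
    # For example (illustrative; the exact token type depends on the loaded
    # keyword dictionaries):
    #
    #     lexer.is_keyword('select')     # -> (<keyword token type>, 'select')
    #     lexer.is_keyword('my_column')  # -> (tokens.Name, 'my_column')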

    def get_tokens(self, text, encoding=None):
        """Split ``text`` into an iterable of ``(tokentype, value)`` pairs.

        ``text`` may be a string, a byte string, or a file-like object
        (anything derived from :class:`io.TextIOBase`). Byte strings are
        decoded using ``encoding`` if given, otherwise UTF-8 with a
        fallback to ``unicode-escape``.
        """
        if isinstance(text, TextIOBase):
            text = text.read()

        if isinstance(text, str):
            pass
        elif isinstance(text, bytes):
            if encoding:
                text = text.decode(encoding)
            else:
                try:
                    text = text.decode('utf-8')
                except UnicodeDecodeError:
                    text = text.decode('unicode-escape')
        else:
            raise TypeError("Expected text or file-like object, got {!r}"
                            .format(type(text)))

        iterable = enumerate(text)
        for pos, char in iterable:
            for rexmatch, action in self._SQL_REGEX:
                m = rexmatch(text, pos)

                if not m:
                    continue
                elif isinstance(action, tokens._TokenType):
                    yield action, m.group()
                elif action is keywords.PROCESS_AS_KEYWORD:
                    yield self.is_keyword(m.group())

                # Skip the characters covered by this match so the outer
                # loop continues at the first position after it.
                consume(iterable, m.end() - pos - 1)
                break
            else:
                yield tokens.Error, char
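
    # Iteration sketch (illustrative comment): token types come from
    # `sqlparse.tokens`; characters that match no regex are yielded as
    # `tokens.Error`.
    #
    #     lexer = Lexer.get_default_instance()
    #     for ttype, value in lexer.get_tokens('select 1'):
    #         print(ttype, repr(value))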


def tokenize(sql, encoding=None):
    """Tokenize *sql*.

    Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
    of ``(token type, value)`` items.
    """
    return Lexer.get_default_instance().get_tokens(sql, encoding)
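
# Usage sketch (illustrative comment, assuming this module is importable as
# `sqlparse.lexer`):
#
#     from sqlparse.lexer import tokenize
#     for ttype, value in tokenize('select 1'):
#         print(ttype, repr(value))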