Clang学习历程 编译过程-词法分析

前言

《编译原理》中提到

编译器的第一个步骤是词法分析(Lexical Analysis)或扫描。词法分析器读入组成源程序的字符流,并且将它们组织成为有意义的词素(lexeme)的序列。对于每个词素,词法分析产生如下形式的词法单元(token)作为输出:
<token-name,attribute-value>
token-name 是一个语法分析步骤要使用的抽象符号
attribute-value指向符号表中关于这个词法单元的条目

实验

int main(){
    @autoreleasepool {
        int initial = 8;
        int six = 6;
        NSString* site = [[NSString alloc] initWithUTF8String:"starming"];
        int rank = initial + six;
        int position = initial + rank * 60;
        NSLog(@"%@ rank %d", site, position);
    }
    return 0;
}
## 使用手工编译的clang执行如下指令
## -fmodules               Enable the 'modules' language feature
## This will make any modules-enabled software libraries available as modules as well as introducing any modules-specific syntax.
## -E                      Only run the preprocessor
## 只运行预处理器
## -Xclang <arg>           Pass <arg> to the clang compiler
## -dump-tokens  -- man clang/ clang --help 都找不到
## 参考1
## http://clang.llvm.org/doxygen/namespaceclang_1_1driver_1_1options.html
## enum clang::driver::options::ClangFlags
## Flags specifically for clang options.

## 参考2
## Running the plugin
## Using the cc1 command line
## To run a plugin, the dynamic library containing the plugin registry # must be loaded via the -load command line option. This will load all plugins that are registered, and you can select the plugins to run by specifying the -plugin option. Additional parameters for the plugins can be passed with -plugin-arg-<plugin-name>.
## Note that those options must reach clang’s cc1 process. There are two ways to do so:

## grep -r "dump-tokens" src/llvm/tools/clang
## src/llvm/tools/clang/include/clang/Driver/CC1Options.td:def dump_tokens : Flag<["-"], "dump-tokens">,
## 根据上面的两个参考链接 + grep的结果确定-dump-tokens应该就是这么来的
## grep -r "dump_tokens" src/llvm/tools/clang
## src/llvm/tools/clang/lib/Frontend/CompilerInvocation.cpp:    case OPT_dump_tokens:
## static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
##                                   DiagnosticsEngine &Diags,
##                                   bool &IsHeaderFile) {
## ...
## case OPT_dump_tokens:
##      Opts.ProgramAction = frontend::DumpTokens; break;                                    
~% /opt/llvm/bin/clang -fmodules -E -Xclang -dump-tokens main.m

结果如下

annot_module_include '#import <Foundation/Foundation.h>

int main(int argc, const char * argv[]) {
    @autoreleasepool {
        int initial = 8;
    '		Loc=<main.m:9:1>
int 'int'	 [StartOfLine]	Loc=<main.m:11:1>
identifier 'main'	 [LeadingSpace]	Loc=<main.m:11:5>
l_paren '('		Loc=<main.m:11:9>
int 'int'		Loc=<main.m:11:10>
identifier 'argc'	 [LeadingSpace]	Loc=<main.m:11:14>
comma ','		Loc=<main.m:11:18>
const 'const'	 [LeadingSpace]	Loc=<main.m:11:20>
char 'char'	 [LeadingSpace]	Loc=<main.m:11:26>
star '*'	 [LeadingSpace]	Loc=<main.m:11:31>
identifier 'argv'	 [LeadingSpace]	Loc=<main.m:11:33>
l_square '['		Loc=<main.m:11:37>
r_square ']'		Loc=<main.m:11:38>
r_paren ')'		Loc=<main.m:11:39>
l_brace '{'	 [LeadingSpace]	Loc=<main.m:11:41>
at '@'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:12:5>
identifier 'autoreleasepool'		Loc=<main.m:12:6>
l_brace '{'	 [LeadingSpace]	Loc=<main.m:12:22>
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:13:9>
identifier 'initial'	 [LeadingSpace]	Loc=<main.m:13:13>
equal '='	 [LeadingSpace]	Loc=<main.m:13:21>
numeric_constant '8'	 [LeadingSpace]	Loc=<main.m:13:23>
semi ';'		Loc=<main.m:13:24>
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:14:9>
identifier 'six'	 [LeadingSpace]	Loc=<main.m:14:13>
equal '='	 [LeadingSpace]	Loc=<main.m:14:17>
numeric_constant '6'	 [LeadingSpace]	Loc=<main.m:14:19>
semi ';'		Loc=<main.m:14:20>
identifier 'NSString'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:15:9>
star '*'		Loc=<main.m:15:17>
identifier 'site'	 [LeadingSpace]	Loc=<main.m:15:19>
equal '='	 [LeadingSpace]	Loc=<main.m:15:24>
l_square '['	 [LeadingSpace]	Loc=<main.m:15:26>
l_square '['		Loc=<main.m:15:27>
identifier 'NSString'		Loc=<main.m:15:28>
identifier 'alloc'	 [LeadingSpace]	Loc=<main.m:15:37>
r_square ']'		Loc=<main.m:15:42>
identifier 'initWithUTF8String'	 [LeadingSpace]	Loc=<main.m:15:44>
colon ':'		Loc=<main.m:15:62>
string_literal '"starming"'		Loc=<main.m:15:63>
r_square ']'		Loc=<main.m:15:73>
semi ';'		Loc=<main.m:15:74>
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:16:9>
identifier 'rank'	 [LeadingSpace]	Loc=<main.m:16:13>
equal '='	 [LeadingSpace]	Loc=<main.m:16:18>
identifier 'initial'	 [LeadingSpace]	Loc=<main.m:16:20>
plus '+'	 [LeadingSpace]	Loc=<main.m:16:28>
identifier 'six'	 [LeadingSpace]	Loc=<main.m:16:30>
semi ';'		Loc=<main.m:16:33>
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:17:9>
identifier 'position'	 [LeadingSpace]	Loc=<main.m:17:13>
equal '='	 [LeadingSpace]	Loc=<main.m:17:22>
identifier 'initial'	 [LeadingSpace]	Loc=<main.m:17:24>
plus '+'	 [LeadingSpace]	Loc=<main.m:17:32>
identifier 'rank'	 [LeadingSpace]	Loc=<main.m:17:34>
star '*'	 [LeadingSpace]	Loc=<main.m:17:39>
numeric_constant '60'	 [LeadingSpace]	Loc=<main.m:17:41>
semi ';'		Loc=<main.m:17:43>
identifier 'NSLog'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:18:9>
l_paren '('		Loc=<main.m:18:14>
at '@'		Loc=<main.m:18:15>
string_literal '"%@ rank %d"'		Loc=<main.m:18:16>
comma ','		Loc=<main.m:18:28>
identifier 'site'	 [LeadingSpace]	Loc=<main.m:18:30>
comma ','		Loc=<main.m:18:34>
identifier 'position'	 [LeadingSpace]	Loc=<main.m:18:36>
r_paren ')'		Loc=<main.m:18:44>
semi ';'		Loc=<main.m:18:45>
r_brace '}'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:19:5>
return 'return'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:20:5>
numeric_constant '0'	 [LeadingSpace]	Loc=<main.m:20:12>
semi ';'		Loc=<main.m:20:13>
r_brace '}'	 [StartOfLine]	Loc=<main.m:21:1>
eof ''		Loc=<main.m:21:2>
## 《编译原理》给的例子
position = initial + rate * 60
## 对应词法序列
<id,1> < = > <id,2> < + > <id,3> < * ><60>

Clang学习历程 编译过程-词法分析

## int position = initial + rank * 60;
int 'int'	 [StartOfLine] [LeadingSpace]	Loc=<main.m:17:9>
identifier 'position'	 [LeadingSpace]	Loc=<main.m:17:13>
equal '='	 [LeadingSpace]	Loc=<main.m:17:22>
identifier 'initial'	 [LeadingSpace]	Loc=<main.m:17:24>
plus '+'	 [LeadingSpace]	Loc=<main.m:17:32>
identifier 'rank'	 [LeadingSpace]	Loc=<main.m:17:34>
star '*'	 [LeadingSpace]	Loc=<main.m:17:39>
numeric_constant '60'	 [LeadingSpace]	Loc=<main.m:17:41>
semi ';'		Loc=<main.m:17:43>

## 可以获得每个 token 的类型,值还有类似 StartOfLine 的位置类型和 Loc=main.m:11:1 这个样的具体位置。
## 和《编译原理》有点不同,attribute-value没有指向符号表中关于这个词法单元的条目

实现

  1. 定义词元
  2. 遍历字符流 && 输出词元

案例

定义词元

入门教程中的Kaleidoscope语言

//===----------------------------------------------------------------------===//
// Lexer
//===----------------------------------------------------------------------===//

// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
// of these for known things.
// 字符流解析词元规则,要么是如下5种类型,要么返回对应的ASCII值
enum Token {
  tok_eof = -1,

  // commands
  tok_def = -2, tok_extern = -3,

  // primary
  tok_identifier = -4, tok_number = -5
};

static std::string IdentifierStr;  // Filled in if tok_identifier
static double NumVal;              // Filled in if tok_number

遍历字符流 && 输出词元

/// gettok - Return the next token from standard input.
static int gettok() {
  static int LastChar = ' ';

  // Skip any whitespace.
  /// 忽略空格
  while (isspace(LastChar))
    LastChar = getchar();

  /// 判定是否是identifier 满足正则条件[a-zA-Z][a-zA-Z0-9]*
  if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
    IdentifierStr = LastChar;
    while (isalnum((LastChar = getchar())))
      IdentifierStr += LastChar;
    
    /// 排除保留的关键字
    if (IdentifierStr == "def") return tok_def;
    if (IdentifierStr == "extern") return tok_extern;
    return tok_identifier;
  }

  /// 判定是否是数字 满足正则条件 [0-9.]+
  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
    std::string NumStr;
    do {
      NumStr += LastChar;
      LastChar = getchar();
    } while (isdigit(LastChar) || LastChar == '.');

    NumVal = strtod(NumStr.c_str(), 0);
    return tok_number;
  }

  /// 判定是否是注释
  if (LastChar == '#') {
    // Comment until end of line.
    do LastChar = getchar();
    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
    
    if (LastChar != EOF)
      return gettok();
  }

  // Check for end of file.  Don't eat the EOF.
  if (LastChar == EOF)
    return tok_eof;

  // Otherwise, just return the character as its ascii value.
  /// 返回字符的ascii值
  int ThisChar = LastChar;
  LastChar = getchar();
  return ThisChar;
}

理论

正则表达式

正则表达式(Regular Expression,RE)是一种用来描述正则语言的更紧凑的表达方式。

每个正则表达式 r 定义(表示)一个语言,记为 L(r)。这个语言也是根据 r 的子表达式所表示的语言递归定义的。

Σ 是给定的有限字符集
ε 是空串(empty string)

归纳基础:

  1. ε是一个正则表达式,L(ε) = {ε},即该语言只包含空串
  2. 如果 a ∈ Σ,那么a是一个正则表达式,且L(a) = {a}。即这个语言仅包含一个长度为1的符号串a

归纳步骤:假设 r 和 s 都是正则表达式,表示的语言分别是L(r)和L(s)

  1. r|s 是一个正则表达式,L(r|s) = L(r)∪L(s)
  2. rs(rs 的连接)是一个正则表达式,L(rs)=L(r)L(s)
  3. r* 是一个正则表达式,L(r*) = (L(r))*
  4. (r)是一个正则表达式,L((r)) = L(r),表明表达式的两边加上括号并不影响表达式所表示的语言

运算符的优先级: * , 连接 , |
* 号代表字符可以不出现,也可以出现一次或者多次

有穷自动机

正则表达式描述的规则让人容易理解,但是要解析字符串,还需要将其转化为计算机程序能理解的模型。

有穷自动机(Finite Automata,FA)是对一类处理系统建立的数学模型。这类系统具有一系列离散的输入输出信息和有穷数目的内部状态

数学表达:

M = (S,Σ,δ,s0,F)

S: 有穷状态集
Σ: 字符表
δ: 转换函数
s0: 初始状态
F: 结束/可接受状态集

表示方法

有穷自动机可以用转换图来表示。

  • 初始状态(开始状态):只有一个,有 start 箭头指向
  • 终止状态/可接受状态:可以有多个,用双圈表示

Clang学习历程 编译过程-词法分析

分类

  • 确定的有穷自动机(DFA)
  • 不确定的有穷自动机(NFA)

非确定有穷自动机NFA 和确定的有穷自动机DFA 唯一的区别是:从状态 s 出发,能到达的状态可能有多个。(并不是唯一确定的)

因此,转换函数为集合,而不是元素。

DFA

Clang学习历程 编译过程-词法分析

NFA
Clang学习历程 编译过程-词法分析

对应的正则表达式是 (a|b)*abb

从正则表达式到有穷自动机

因为NFA 同一输入导出多种状态,判定接受比较麻烦,因此,一般会将NFA转换为DFA来实现

简而言之,要做的就是 RE -> NFA -> DFA

Thompson 算法 (RE -> NFA)

正则表达式是递归定义的,通过
不停的分解子表达式,即可求得最终的 NFA。

Clang学习历程 编译过程-词法分析

比如

r=(a|b)*abb 对应的 NFA

Clang学习历程 编译过程-词法分析

子集构造法 (NFA -> DFA)

子集构造法的基本思想是让构造得到的DFA的每个状态对应于NFA的一个状态集合

正则表达式 (a|b)*abb


状态0 记为 s0

s0 输入 a 可到达状态{0,1} 记为s1
s0 -> a = {0,1} = s1

s0 输入 b 可到达状态{0} 即s0
s0 -> b = {0} = s0

s1 输入 a 
当处于状态0时,可到达{0,1}
当处于状态1时,可到达∅ 两者取并集
可到达状态{0,1} 即s1
(s1,a) -> {0,1} = s1

s1 输入 b 可到达状态{0,2} 记为s2
(s1,b) -> {0,2} = s2

s2 输入 a
当处于状态0时,可到达{0,1}
当处于状态2时,可到达∅,两者取并集
可到达状态{0,1} 即s1
s2 输入 a 可到达{0,1}
(s2,a) -> {0,1} = s1

s2 输入 b 
当处于状态0时,可到达{0}
当处于状态2时,可到达{3},两者取并集
可到达状态{0,3},记为s3
(s2,b) -> {0,3} = s3

s3 输入 a
0 -> {0,1}
3 -> ∅
{0,1} U ∅ => {0,1} = s1
(s3,a) -> {0,1} = s1

s3 输入 b
0 -> {0}
3 -> ∅
{0} U ∅ => {0} = s0
(s3,b) -> s0

到s3继续输入a|b,回到s0/s1,没有新的状态,再输入只会重新执行上面的过程,因此构造结束,包含状态3的是终止节点

对上面的过程进行梳理可得

s0 是初始状态
s0 输入 b 到 s0 状态
s0 输入 a 到 s1 状态

s1 输入 a 到 s1 状态
s1 输入 b 到 s2 状态

s2 输入 a 到 s1 状态
s2 输入 b 到 s3 状态

s3 输入 a 到 s1 状态
s3 输入 b 到 s0 状态

整个思考过程处理成表格的形式

状态集\条件 a b
0 {0,1} => s1 {0} => s0
1 {0,1} => s1 {0,2} => s2
2 {0,1} => s1 {0,3} => s3
3 {0,1} => s1 {0} => s0

Clang学习历程 编译过程-词法分析

另外一种情况 NFA 中包含 ε 边比如参考链接9中举的例子

正则表达式为 a (b|c)*

Clang学习历程 编译过程-词法分析

字符集为{a,b,c}

设s0 = n0
s0 输入a
可到达状态{n1} ``ε`` 不消耗代价,可以略过,因此实际可到达为 {n1,n2,n3,n4,n6,n9} 记为 s1
状态s0 输入a后,通过ε转换可到达的NFA状态集合
ε.closure(Move(s0,a)) = {n1,n2,n3,n4,n6,n9} = s1 终止状态(包含了n9)

观察上面的图+思考比较容易判断出,哪些情况下输入b,c得到的结果不会是∅ ,即有效的
s1 输入b
ε.closure(Move(s1,b)) = {n5,n8,n9,n3,n4,n6} = s2 终止状态

s1 输入c
ε.closure(Move(s1,c)) = {n7,n8,n9,n3,n4,n6} = s3 终止状态

s2 输入b
ε.closure(Move(s2,b)) = {n5,n8,n9,n3,n4,n6} = s2
s2 输入 c
ε.closure(Move(s2,c)) = {n7,n8,n9,n3,n4,n6} = s3

s3 输入 b
ε.closure(Move(s3,b)) = {n5,n8,n9,n3,n4,n6} = s2
s3 输入 c
ε.closure(Move(s3,c)) = {n7,n8,n9,n3,n4,n6} = s3
状态集\条件 a b c
s0 {n1,n2,n3,n4,n6,n9} => s1
s1 {n5,n8,n9,n3,n4,n6} => s2 {n7,n8,n9,n3,n4,n6} => s3
s2 {n5,n8,n9,n3,n4,n6} => s2 {n7,n8,n9,n3,n4,n6} => s3
s3 {n5,n8,n9,n3,n4,n6} => s2 {n7,n8,n9,n3,n4,n6} => s3

Clang学习历程 编译过程-词法分析

参考链接9

/// 子集构造算法: 工作表算法 
/// 初始状态
s0 <- eps_closure(n0)
Q <- {s0}
/// 将初始状态添加到工作表中
workList <- s0
while (workList != [])
	/// 从工作表中取出一个状态s进行处理
	remove s from workList
	foreach (character c)
	   /**
	    1.遍历状态集s在输入条件c下得到的状态集合
	    2.通过e-closure扩展边界,然后取并集,得到最终的状态集
	    */
		t <- e-closure(delta(s,c))
		D[s,c] <- t
		/** 
		判断标准: 经过e-closure 转换,发现新的状态,继续进行子集构造操作,
		否则证明这条分支的转换结束,继续判断工作列表中是否为空
		*/
		if (t not in Q)
			add t to Q and workList

因为有限集合的子集是有限的,N个元素的集合,最多有2^N个子集,每次遍历都会将状态加入到Q中,所以循环最终会结束。

DFA的状态数有可能是NFA状态数的指数,在这种情况下,我们在试图实现这个DFA时会遇到困难。然而,基于自动机的词法分析方法的处理部分源于如下事实:对于一个真实的语言,它的NFA和DFA的状态数量大致相同,状态数量呈指数关系的情形尚未在实践中出现过
— 《编译原理》

Hopcroft 算法 (最小化 DFA)

经过前面两步,已经将 RE,转换成DFA,但是还需要进行最小化的操作

参考链接9

split(s)
	foreach(character c)	
	   /**
	    输入字符c到状态集合s中,每种状态输出的状态集存在不一致的情况,证明字符c可以切割这个状态集合s
	    */
		if (c can split s)
			split s into T1,...,Tk
	
hopcroft()
  /**
  首次划分: N非结束状态,A结束状态
  */
	split all nodes into N,A
	while (set is still changing)
		split(s)

Clang学习历程 编译过程-词法分析

将所有终止状态合并为状态s4

该状态s4接收字符b,c

用字符b尝试切割

状态1 输入 b 得到状态2 在状态s4中
状态2 输入 b 得到状态2 在状态s4中
状态3 输入 b 空集 忽略

无法通过字符b找到一个不属于状态s4的状态

对字符c同理,因此可以合并

Clang学习历程 编译过程-词法分析

Clang 中的实现

要解决的问题是: Clang 是如何进行词法分析的?

如何使用lldb 调试Clang 工程

在上面的实验分析,发现终端在执行clang 指令时,最终会传递到CompilerInvocation.cpp 这个文件的ParseFrontendArgs方法中。
指令完成后,进程就结束了,并不像iOS 应用会有运行循环。

因此在CompilerInvocation.cpp文件中

引入 #include <unistd.h> 头文件
添加延时的操作
case OPT_dump_tokens:    
    Opts.ProgramAction = frontend::DumpTokens; 
    sleep(20);    
    break;

重新编译安装
sudo ninja -j4 install

窗口A执行

## 定位到工程目录中
/opt/llvm/bin/clang-8 -fmodules -E -Xclang -dump-tokens main.mm

窗口B执行

~ % ps -ef | grep clang
## 会有类似下面的输出
% ps -ef | grep clang-8
  501   497  3105   0 11:11AM ttys000    0:00.01 grep --color=auto --exclude-dir=.bzr --exclude-dir=CVS --exclude-dir=.git --exclude-dir=.hg --exclude-dir=.svn clang-8
  501   384 13315   0 11:11AM ttys001    0:00.06 /opt/llvm/bin/clang-8 -fmodules -E -Xclang -dump-tokens main.mm
  501   392   384   0 11:11AM ttys001    0:00.01 /opt/llvm/bin/clang-8 -cc1 -triple x86_64-apple-macosx10.13.0 -Wdeprecated-objc-isa-usage -Werror=deprecated-objc-isa-usage -E -disable-free -disable-llvm-verifier -discard-value-names -main-file-name main.mm -mrelocation-model pic -pic-level 2 -mthread-model posix -mdisable-fp-elim -masm-verbose -munwind-tables -target-cpu penryn -dwarf-column-info -debugger-tuning=lldb -ggnu-pubnames -target-linker-version 409.12 -resource-dir /opt/llvm/lib/clang/8.0.0 -stdlib=libc++ -internal-isystem /opt/llvm/include/c++/v1 -fdeprecated-macro -fdebug-compilation-dir /Users/Jason/Desktop/TryLLVM/prj/LLVM_02/LLVM_02 -ferror-limit 19 -fmessage-length 158 -stack-protector 1 -fblocks -fencode-extended-block-signature -fmodules -fimplicit-module-maps -fmodules-cache-path=/var/folders/lk/znn1qp4925j412wt4t_qkplc0000gn/C/org.llvm.clang.Jason/ModuleCache -fmodules-validate-system-headers -fregister-global-dtors-with-atexit -fobjc-runtime=macosx-10.13.0 -fobjc-exceptions -fcxx-exceptions -fexceptions -fmax-type-align=16 -fdiagnostics-show-option -fcolor-diagnostics -dump-tokens -o - -x objective-c++ main.mm

窗口C执行

## 进入lldb 交互环境
~% lldb 
## attach到对应的进程上 选带cc1的那个进程
~% process attach --pid 392
## 查看调用栈
~% bt all

Clang学习历程 编译过程-词法分析

查看源代码,经过整理后,大致是这样的

## CompilerInvocation.cpp
CompilerInvocation::CreateFromArgs 调用
ParseFrontendArgs 调用
## FrontendAction.cpp
DumpTokensAction::ExecuteAction
void DumpTokensAction::ExecuteAction() {
  Preprocessor &PP = getCompilerInstance().getPreprocessor();
  // Start preprocessing the specified input file.
  Token Tok;
  PP.EnterMainSourceFile();
  do {
    PP.Lex(Tok);
    PP.DumpToken(Tok, true);
    llvm::errs() << "\n";
  } while (Tok.isNot(tok::eof));
}
void Preprocessor::Lex(Token &Result) {
  // We loop here until a lex function returns a token; this avoids recursion.
  bool ReturnedToken;
  do {
    switch (CurLexerKind) {
    case CLK_Lexer:
      ReturnedToken = CurLexer->Lex(Result);
      break;
    case CLK_TokenLexer:
      ReturnedToken = CurTokenLexer->Lex(Result);
      break;
    case CLK_CachingLexer:
      CachingLex(Result);
      ReturnedToken = true;
      break;
    case CLK_LexAfterModuleImport:
      LexAfterModuleImport(Result);
      ReturnedToken = true;
      break;
    }
  } while (!ReturnedToken);

  if (Result.is(tok::code_completion) && Result.getIdentifierInfo()) {
    // Remember the identifier before code completion token.
    setCodeCompletionIdentifierInfo(Result.getIdentifierInfo());
    setCodeCompletionTokenRange(Result.getLocation(), Result.getEndLoc());
    // Set IdenfitierInfo to null to avoid confusing code that handles both
    // identifiers and completion tokens.
    Result.setIdentifierInfo(nullptr);
  }

  LastTokenWasAt = Result.is(tok::at);
}
void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
  llvm::errs() << tok::getTokenName(Tok.getKind()) << " '"
               << getSpelling(Tok) << "'";

  if (!DumpFlags) return;

  llvm::errs() << "\t";
  if (Tok.isAtStartOfLine())
    llvm::errs() << " [StartOfLine]";
  if (Tok.hasLeadingSpace())
    llvm::errs() << " [LeadingSpace]";
  if (Tok.isExpandDisabled())
    llvm::errs() << " [ExpandDisabled]";
  if (Tok.needsCleaning()) {
    const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
    llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength())
                 << "']";
  }

  llvm::errs() << "\tLoc=<";
  DumpLocation(Tok.getLocation());
  llvm::errs() << ">";
}

CompilerInvocation::CreateFromArgs 方法上也做类似延时的操作,然后设置断点

(lldb) br set --name ParseFrontendArgs

Clang学习历程 编译过程-词法分析

参数Token的结构如下

namespace clang {

class IdentifierInfo;

/// Token - This structure provides full information about a lexed token.
/// It is not intended to be space efficient, it is intended to return as much
/// information as possible about each returned token.  This is expected to be
/// compressed into a smaller form if memory footprint is important.
///
/// The parser can create a special "annotation token" representing a stream of
/// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
/// can be represented by a single typename annotation token that carries
/// information about the SourceRange of the tokens and the type object.
class Token {
  /// The location of the token. This is actually a SourceLocation.
  unsigned Loc;

  // Conceptually these next two fields could be in a union.  However, this
  // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
  // routine. Keeping as separate members with casts until a more beautiful fix
  // presents itself.

  /// UintData - This holds either the length of the token text, when
  /// a normal token, or the end of the SourceRange when an annotation
  /// token.
  /// token 的长度
  unsigned UintData;

  /// PtrData - This is a union of four different pointer types, which depends
  /// on what type of token this is:
  ///  Identifiers, keywords, etc:
  ///    This is an IdentifierInfo*, which contains the uniqued identifier
  ///    spelling.
  ///  Literals:  isLiteral() returns true.
  ///    This is a pointer to the start of the token in a text buffer, which
  ///    may be dirty (have trigraphs / escaped newlines).
  ///  Annotations (resolved type names, C++ scopes, etc): isAnnotation().
  ///    This is a pointer to sema-specific data for the annotation token.
  ///  Eof:
  //     This is a pointer to a Decl.
  ///  Other:
  ///    This is null.
  /// token的内容
  void *PtrData;

  /// Kind - The actual flavor of token this is.
  /// token的类型
  tok::TokenKind Kind;

  /// Flags - Bits we track about this token, members of the TokenFlags enum.
  unsigned short Flags;

public:  
	...

tok::TokenKind 的声明

TokenKinds.h

namespace tok {

/// Provides a simple uniform namespace for tokens from all C languages.
enum TokenKind : unsigned short {
#define TOK(X) X,
#include "clang/Basic/TokenKinds.def"
  NUM_TOKENS
};

/// Provides a namespace for preprocessor keywords which start with a
/// '#' at the beginning of the line.
enum PPKeywordKind {
#define PPKEYWORD(X) pp_##X,
#include "clang/Basic/TokenKinds.def"
  NUM_PP_KEYWORDS
};

/// Provides a namespace for Objective-C keywords which start with
/// an '@'.
enum ObjCKeywordKind {
#define OBJC_AT_KEYWORD(X) objc_##X,
#include "clang/Basic/TokenKinds.def"
  NUM_OBJC_KEYWORDS
};

// clang/Basic/TokenKinds.def
...
// C99 6.4.2: Identifiers.
TOK(identifier)          // abcde123
TOK(raw_identifier)      // Used only in raw lexing mode.

// C99 6.4.4.1: Integer Constants
// C99 6.4.4.2: Floating Constants
TOK(numeric_constant)    // 0x123

// C99 6.4.4: Character Constants
TOK(char_constant)       // 'a'
TOK(wide_char_constant)  // L'b'

// C++17 Character Constants
TOK(utf8_char_constant)  // u8'a'

// C++11 Character Constants
TOK(utf16_char_constant) // u'a'
TOK(utf32_char_constant) // U'a'

// C99 6.4.5: String Literals.
TOK(string_literal)      // "foo"
TOK(wide_string_literal) // L"foo"
TOK(angle_string_literal)// <foo>

// C++11 String Literals.
TOK(utf8_string_literal) // u8"foo"
TOK(utf16_string_literal)// u"foo"
TOK(utf32_string_literal)// U"foo"

// C99 6.4.6: Punctuators.
PUNCTUATOR(l_square,            "[")
PUNCTUATOR(r_square,            "]")
PUNCTUATOR(l_paren,             "(")
PUNCTUATOR(r_paren,             ")")
PUNCTUATOR(l_brace,             "{")
PUNCTUATOR(r_brace,             "}")
PUNCTUATOR(period,              ".")
PUNCTUATOR(ellipsis,            "...")
PUNCTUATOR(amp,                 "&")
PUNCTUATOR(ampamp,              "&&")
PUNCTUATOR(ampequal,            "&=")
PUNCTUATOR(star,                "*")
PUNCTUATOR(starequal,           "*=")
PUNCTUATOR(plus,                "+")
PUNCTUATOR(plusplus,            "++")
PUNCTUATOR(plusequal,           "+=")
...
/// Lexer.cpp
/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
/// 词法分析的方法
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine)

...
// Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;
/// 遍历字符 
  switch (Char) {
...
case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;

		 /// 比如遇到\n就判定为tok::eod类型
      Kind = tok::eod;
      break;
    }
...
   // C99 6.4.4.1: Integer Constants.
   // C99 6.4.4.2: Floating Constants.
   // 整型常量
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);
...
	// C99 6.4.2: Identifiers. 是不是标识符
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed.  Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
...
/// 类型设置为 tok::numeric_constant
FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
...

/// 判断是否是标识符
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
...

参考

  1. LLVM Tutorial
  2. 第一章 教程简介与词法分析器
  3. 深入剖析 iOS 编译 Clang LLVM
  4. Lexical_analysis
  5. 编译原理(3):词法分析
  6. Wiki Regular expression
  7. 编译原理:有穷自动机(DFA与NFA)
  8. 【编译原理】:NFA转变为DFA的子集构造法
  9. 网易云课堂 - 编译原理
  10. 状态图制作网站
  11. 64位汇编参数传递