Clang学习历程 编译过程-词法分析
前言
《编译原理》中提到
编译器的第一个步骤是词法分析(Lexical Analysis)或扫描。词法分析器读入组成源程序的字符流,并且将它们组织成为有意义的词素(lexeme)的序列。对于每个词素,词法分析产生如下形式的词法单元(token)作为输出:
<token-name,attribute-value>
token-name
是一个语法分析步骤要使用的抽象符号attribute-value
指向符号表中关于这个词法单元的条目
实验
#import <Foundation/Foundation.h>

// Sample program fed to the lexer. The signature and the #import line
// are required for the token dump shown below: the #import produces the
// annot_module_include token, and the dump lists tokens for
// `int main(int argc, const char * argv[])` starting at main.m:11.
int main(int argc, const char * argv[]) {
    @autoreleasepool {
        int initial = 8;
        int six = 6;
        NSString* site = [[NSString alloc] initWithUTF8String:"starming"];
        int rank = initial + six;
        int position = initial + rank * 60;
        NSLog(@"%@ rank %d", site, position);
    }
    return 0;
}
## 使用手工编译的clang执行如下指令
## -fmodules Enable the 'modules' language feature
## This will make any modules-enabled software libraries available as modules as well as introducing any modules-specific syntax.
## -E Only run the preprocessor
## 只运行预处理器
## -Xclang <arg> Pass <arg> to the clang compiler
## -dump-tokens -- 在 man clang / clang --help 中都找不到
## 参考1
## http://clang.llvm.org/doxygen/namespaceclang_1_1driver_1_1options.html
## enum clang::driver::options::ClangFlags
## Flags specifically for clang options.
## 参考2
## Running the plugin
## Using the cc1 command line
## To run a plugin, the dynamic library containing the plugin registry # must be loaded via the -load command line option. This will load all plugins that are registered, and you can select the plugins to run by specifying the -plugin option. Additional parameters for the plugins can be passed with -plugin-arg-<plugin-name>.
## Note that those options must reach clang’s cc1 process. There are two ways to do so:
## grep -r "dump-tokens" src/llvm/tools/clang
## src/llvm/tools/clang/include/clang/Driver/CC1Options.td:def dump_tokens : Flag<["-"], "dump-tokens">,
## 根据上面的两个参考链接 + grep的结果确定-dump-tokens应该就是这么来的
## grep -r "dump_tokens" src/llvm/tools/clang
## src/llvm/tools/clang/lib/Frontend/CompilerInvocation.cpp: case OPT_dump_tokens:
## static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
## DiagnosticsEngine &Diags,
## bool &IsHeaderFile) {
## ...
## case OPT_dump_tokens:
## Opts.ProgramAction = frontend::DumpTokens; break;
~% /opt/llvm/bin/clang -fmodules -E -Xclang -dump-tokens main.m
结果如下
annot_module_include '#import <Foundation/Foundation.h>
int main(int argc, const char * argv[]) {
@autoreleasepool {
int initial = 8;
' Loc=<main.m:9:1>
int 'int' [StartOfLine] Loc=<main.m:11:1>
identifier 'main' [LeadingSpace] Loc=<main.m:11:5>
l_paren '(' Loc=<main.m:11:9>
int 'int' Loc=<main.m:11:10>
identifier 'argc' [LeadingSpace] Loc=<main.m:11:14>
comma ',' Loc=<main.m:11:18>
const 'const' [LeadingSpace] Loc=<main.m:11:20>
char 'char' [LeadingSpace] Loc=<main.m:11:26>
star '*' [LeadingSpace] Loc=<main.m:11:31>
identifier 'argv' [LeadingSpace] Loc=<main.m:11:33>
l_square '[' Loc=<main.m:11:37>
r_square ']' Loc=<main.m:11:38>
r_paren ')' Loc=<main.m:11:39>
l_brace '{' [LeadingSpace] Loc=<main.m:11:41>
at '@' [StartOfLine] [LeadingSpace] Loc=<main.m:12:5>
identifier 'autoreleasepool' Loc=<main.m:12:6>
l_brace '{' [LeadingSpace] Loc=<main.m:12:22>
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:13:9>
identifier 'initial' [LeadingSpace] Loc=<main.m:13:13>
equal '=' [LeadingSpace] Loc=<main.m:13:21>
numeric_constant '8' [LeadingSpace] Loc=<main.m:13:23>
semi ';' Loc=<main.m:13:24>
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:14:9>
identifier 'six' [LeadingSpace] Loc=<main.m:14:13>
equal '=' [LeadingSpace] Loc=<main.m:14:17>
numeric_constant '6' [LeadingSpace] Loc=<main.m:14:19>
semi ';' Loc=<main.m:14:20>
identifier 'NSString' [StartOfLine] [LeadingSpace] Loc=<main.m:15:9>
star '*' Loc=<main.m:15:17>
identifier 'site' [LeadingSpace] Loc=<main.m:15:19>
equal '=' [LeadingSpace] Loc=<main.m:15:24>
l_square '[' [LeadingSpace] Loc=<main.m:15:26>
l_square '[' Loc=<main.m:15:27>
identifier 'NSString' Loc=<main.m:15:28>
identifier 'alloc' [LeadingSpace] Loc=<main.m:15:37>
r_square ']' Loc=<main.m:15:42>
identifier 'initWithUTF8String' [LeadingSpace] Loc=<main.m:15:44>
colon ':' Loc=<main.m:15:62>
string_literal '"starming"' Loc=<main.m:15:63>
r_square ']' Loc=<main.m:15:73>
semi ';' Loc=<main.m:15:74>
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:16:9>
identifier 'rank' [LeadingSpace] Loc=<main.m:16:13>
equal '=' [LeadingSpace] Loc=<main.m:16:18>
identifier 'initial' [LeadingSpace] Loc=<main.m:16:20>
plus '+' [LeadingSpace] Loc=<main.m:16:28>
identifier 'six' [LeadingSpace] Loc=<main.m:16:30>
semi ';' Loc=<main.m:16:33>
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:17:9>
identifier 'position' [LeadingSpace] Loc=<main.m:17:13>
equal '=' [LeadingSpace] Loc=<main.m:17:22>
identifier 'initial' [LeadingSpace] Loc=<main.m:17:24>
plus '+' [LeadingSpace] Loc=<main.m:17:32>
identifier 'rank' [LeadingSpace] Loc=<main.m:17:34>
star '*' [LeadingSpace] Loc=<main.m:17:39>
numeric_constant '60' [LeadingSpace] Loc=<main.m:17:41>
semi ';' Loc=<main.m:17:43>
identifier 'NSLog' [StartOfLine] [LeadingSpace] Loc=<main.m:18:9>
l_paren '(' Loc=<main.m:18:14>
at '@' Loc=<main.m:18:15>
string_literal '"%@ rank %d"' Loc=<main.m:18:16>
comma ',' Loc=<main.m:18:28>
identifier 'site' [LeadingSpace] Loc=<main.m:18:30>
comma ',' Loc=<main.m:18:34>
identifier 'position' [LeadingSpace] Loc=<main.m:18:36>
r_paren ')' Loc=<main.m:18:44>
semi ';' Loc=<main.m:18:45>
r_brace '}' [StartOfLine] [LeadingSpace] Loc=<main.m:19:5>
return 'return' [StartOfLine] [LeadingSpace] Loc=<main.m:20:5>
numeric_constant '0' [LeadingSpace] Loc=<main.m:20:12>
semi ';' Loc=<main.m:20:13>
r_brace '}' [StartOfLine] Loc=<main.m:21:1>
eof '' Loc=<main.m:21:2>
## 《编译原理》给的例子
position = initial + rate * 60
## 对应词法序列
<id,1> <=> <id,2> <+> <id,3> <*> <60>
## int position = initial + rank * 60;
int 'int' [StartOfLine] [LeadingSpace] Loc=<main.m:17:9>
identifier 'position' [LeadingSpace] Loc=<main.m:17:13>
equal '=' [LeadingSpace] Loc=<main.m:17:22>
identifier 'initial' [LeadingSpace] Loc=<main.m:17:24>
plus '+' [LeadingSpace] Loc=<main.m:17:32>
identifier 'rank' [LeadingSpace] Loc=<main.m:17:34>
star '*' [LeadingSpace] Loc=<main.m:17:39>
numeric_constant '60' [LeadingSpace] Loc=<main.m:17:41>
semi ';' Loc=<main.m:17:43>
## 可以获得每个 token 的类型,值还有类似 StartOfLine 的位置类型和 Loc=main.m:11:1 这个样的具体位置。
## 和《编译原理》有点不同,attribute-value没有指向符号表中关于这个词法单元的条目
实现
- 定义词元
- 遍历字符流 && 输出词元
案例
定义词元
入门教程中的Kaleidoscope
语言
//===----------------------------------------------------------------------===//
// Lexer
//===----------------------------------------------------------------------===//
// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
// of these for known things.
// 字符流解析词元规则,要么是如下5种类型,要么返回对应的ASCII值
// Token kinds produced by the lexer. Known constructs return one of
// these negative values; any other (unknown) character is returned
// directly as its ASCII value in [0-255].
enum Token {
tok_eof = -1,
// commands
tok_def = -2, tok_extern = -3,
// primary
tok_identifier = -4, tok_number = -5
};
// Side-channel values for the most recently lexed token:
static std::string IdentifierStr; // Filled in if tok_identifier
static double NumVal; // Filled in if tok_number
遍历字符流 && 输出词元
/// gettok - Return the next token from standard input.
/// gettok - Return the next token from standard input.
///
/// Reads characters one at a time via getchar(). On return, either one
/// of the negative Token enum values is produced (with IdentifierStr or
/// NumVal filled in as a side effect), or the character's own ASCII
/// value is returned for characters the lexer does not recognize.
static int gettok() {
  // LastChar holds the one-character lookahead between calls.
  static int LastChar = ' ';

  // Skip any whitespace.
  while (isspace(LastChar))
    LastChar = getchar();

  // Identifier: [a-zA-Z][a-zA-Z0-9]*
  if (isalpha(LastChar)) {
    IdentifierStr = LastChar;
    while (isalnum((LastChar = getchar())))
      IdentifierStr += LastChar;

    // Reserved keywords take precedence over plain identifiers.
    if (IdentifierStr == "def") return tok_def;
    if (IdentifierStr == "extern") return tok_extern;
    return tok_identifier;
  }

  // Number: [0-9.]+
  // NOTE(review): this pattern also accepts malformed input such as
  // "1.23.45" as a single number — a known simplification of the
  // tutorial lexer.
  if (isdigit(LastChar) || LastChar == '.') {
    std::string NumStr;
    do {
      NumStr += LastChar;
      LastChar = getchar();
    } while (isdigit(LastChar) || LastChar == '.');
    NumVal = strtod(NumStr.c_str(), 0);
    return tok_number;
  }

  // '#' starts a comment that runs until end of line; then lex again.
  if (LastChar == '#') {
    do LastChar = getchar();
    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
    if (LastChar != EOF)
      return gettok();
  }

  // Check for end of file. Don't eat the EOF.
  if (LastChar == EOF)
    return tok_eof;

  // Otherwise, just return the character as its ascii value.
  int ThisChar = LastChar;
  LastChar = getchar();
  return ThisChar;
}
理论
正则表达式
正则表达式(Regular Expression,RE)
是一种用来描述正则语言的更紧凑的表达方式。
每个正则表达式 r
定义(表示)一个语言,记为 L(r)
。这个语言也是根据 r
的子表达式所表示的语言递归定义的。
Σ 是给定的有限字符集
ε 是空串(empty string)
归纳基础:
-
ε
是一个正则表达式,L(ε) = {ε}
,即该语言只包含空串 - 如果
a ∈ Σ
,那么a
是一个正则表达式,且L(a) = {a}
。即这个语言仅包含一个长度为1的符号串a
。
归纳步骤:假设 r 和 s 都是正则表达式,表示的语言分别是L(r)和L(s)
-
r|s
是一个正则表达式,L(r|s) = L(r)∪L(s)
-
rs
(r
和s
的连接)是一个正则表达式,L(rs)=L(r)L(s)
-
r*
是一个正则表达式,L(r*) = (L(r))*
-
(r)
是一个正则表达式,L((r)) = L(r)
,表明表达式的两边加上括号并不影响表达式所表示的语言
运算符的优先级: *
, 连接 , |
*
号代表字符可以不出现,也可以出现一次或者多次
有穷自动机
正则表达式描述的规则容易被人理解,但是要解析字符串,还需要将其转化为计算机程序能理解的模型。
有穷自动机(Finite Automata,FA)
是对一类处理系统建立的数学模型。这类系统具有一系列离散的输入输出信息和有穷数目的内部状态。
数学表达:
M = (S,Σ,δ,s0,F)
S
: 有穷状态集Σ
: 字符表δ
: 转换函数s0
: 初始状态F
: 结束/可接受状态集
表示方法
有穷自动机可以用转换图来表示。
- 初始状态(开始状态):只有一个,有
start
箭头指向 - 终止状态/可接受状态:可以有多个,用双圈表示
分类
- 确定的有穷自动机(DFA)
- 不确定的有穷自动机(NFA)
非确定有穷自动机NFA 和确定的有穷自动机DFA 唯一的区别是:从状态 s 出发,能到达的状态可能有多个。(并不是唯一确定的)
因此,转换函数为集合,而不是元素。
DFA
NFA
对应的正则表达式是 (a|b)*abb
从正则表达式到有穷自动机
因为NFA
同一输入导出多种状态,判定接受比较麻烦,因此,一般会将NFA
转换为DFA
来实现
简而言之,要做的就是 RE
-> NFA
-> DFA
Thompson 算法 (RE
到 NFA
)
正则表达式是递归定义的,通过
不停的分解子表达式,即可求得最终的 NFA。
比如
r=(a|b)*abb
对应的 NFA
子集构造法 (NFA
到 DFA
)
子集构造法的基本思想是让构造得到的DFA
的每个状态对应于NFA
的一个状态集合
正则表达式 (a|b)*abb
状态0 记为 s0
s0 输入 a 可到达状态{0,1} 记为s1
s0 -> a = {0,1} = s1
s0 输入 b 可到达状态{0} 即s0
s0 -> b = {0} = s0
s1 输入 a
当处于状态0时,可到达{0,1}
当处于状态1时,可到达∅ 两者取并集
可到达状态{0,1} 即s1
(s1,a) -> {0,1} = s1
s1 输入 b 可到达状态{0,2} 记为s2
(s1,b) -> {0,2} = s2
s2 输入 a
当处于状态0时,可到达{0,1}
当处于状态2时,可到达∅,两者取并集
可到达状态{0,1} 即s1
s2 输入 a 可到达{0,1}
(s2,a) -> {0,1} = s1
s2 输入 b
当处于状态0时,可到达{0}
当处于状态2时,可到达{3},两者取并集
可到达状态{0,3},记为s3
(s2,b) -> {0,3} = s3
s3 输入 a
0 -> {0,1}
3 -> ∅
{0,1} U ∅ => {0,1} = s1
(s3,a) -> {0,1} = s1
s3 输入 b
0 -> {0}
3 -> ∅
{0} U ∅ => {0} = s0
(s3,b) -> s0
到s3继续输入a|b,回到s0/s1,没有新的状态,再输入只会重新执行上面的过程,因此构造结束,包含状态3的是终止节点
对上面的过程进行梳理可得
s0 是初始状态
s0 输入 b 到 s0 状态
s0 输入 a 到 s1 状态
s1 输入 a 到 s1 状态
s1 输入 b 到 s2 状态
s2 输入 a 到 s1 状态
s2 输入 b 到 s3 状态
s3 输入 a 到 s1 状态
s3 输入 b 到 s0 状态
整个思考过程处理成表格的形式
状态集\条件 | a | b |
---|---|---|
0 | {0,1} => s1 | {0} => s0 |
1 | {0,1} => s1 | {0,2} => s2 |
2 | {0,1} => s1 | {0,3} => s3 |
3 | {0,1} => s1 | {0} => s0 |
另外一种情况 NFA
中包含 ε
边比如参考链接9中举的例子
正则表达式为 a (b|c)*
字符集为{a,b,c}
设s0 = n0
s0 输入a
可到达状态{n1},由于 ε 边不消耗输入,可以直接穿过,因此实际可到达状态为 {n1,n2,n3,n4,n6,n9},记为 s1
状态s0 通过ε转换到达的NFA状态集合
ε.closure(s0) = {n1,n2,n3,n4,n6,n9} = s1 终止状态(包含了n9)
观察上面的图+思考比较容易判断出,哪些情况下输入b,c得到的结果不会是∅ ,即有效的
s1 输入b
ε.closure(Move(s1,b)) = {n5,n8,n9,n3,n4,n6} = s2 终止状态
s1 输入c
ε.closure(Move(s1,c)) = {n7,n8,n9,n3,n4,n6} = s3 终止状态
s2 输入b
ε.closure(Move(s2,b)) = {n5,n8,n9,n3,n4,n6} = s2
s2 输入 c
ε.closure(Move(s2,c)) = {n7,n8,n9,n3,n4,n6} = s3
s3 输入 b
ε.closure(Move(s3,b)) = {n5,n8,n9,n3,n4,n6} = s2
s3 输入 c
ε.closure(Move(s3,c)) = {n7,n8,n9,n3,n4,n6} = s3
状态集\条件 | a | b | c |
---|---|---|---|
s0 | {n1,n2,n3,n4,n6,n9} => s1 | ∅ | ∅ |
s1 | ∅ | {n5,n8,n9,n3,n4,n6} => s2 | {n7,n8,n9,n3,n4,n6} => s3 |
s2 | ∅ | {n5,n8,n9,n3,n4,n6} => s2 | {n7,n8,n9,n3,n4,n6} => s3 |
s3 | ∅ | {n5,n8,n9,n3,n4,n6} => s2 | {n7,n8,n9,n3,n4,n6} => s3 |
/// 子集构造算法: 工作表算法
/// 初始状态
s0 <- eps_closure(n0)
Q <- {s0}
/// 将初始状态添加到工作表中
workList <- s0
while (workList != [])
/// 从工作表中取出一个状态 s 进行处理
remove s from workList
foreach (character c)
/**
1.遍历状态集s在输入条件c下得到的状态集合
2.通过e-closure扩展边界,然后取并集,得到最终的状态集
*/
t <- e-closure(delta(s,c))
D[s,c] <- t
/**
判断标准: 经过e-closure 转换,发现新的状态,继续进行子集构造操作,
否则证明这条分支的转换结束,继续判断工作列表中是否为空
*/
if (t not in Q)
add t to Q and workList
因为有限集合的子集是有限的,N
个元素的集合,最多有2^N
个子集,每次遍历都会将状态加入到Q
中,所以循环最终会结束。
DFA
的状态数有可能是NFA
状态数的指数,在这种情况下,我们在试图实现这个DFA
时会遇到困难。然而,基于自动机的词法分析方法的处理部分源于如下事实:对于一个真实的语言,它的NFA
和DFA
的状态数量大致相同,状态数量呈指数关系的情形尚未在实践中出现过
— 《编译原理》
Hopcroft 算法 (最小化 DFA
)
经过前面两步,已经将 RE
,转换成DFA
,但是还需要进行最小化的操作
split(s)
foreach(character c)
/**
输入字符c到状态集合s中,每种状态输出的状态集存在不一致的情况,证明字符c可以切割这个状态集合s
*/
if (c can split s)
split s into T1,...,Tk
hopcroft()
/**
首次划分: N非结束状态,A结束状态
*/
split all nodes into N,A
while (set is still changes)
split(s)
将所有终止状态合并为状态s4
该状态s4
接收字符b
,c
用字符b尝试切割
状态1 输入 b 得到状态2 在状态s4中
状态2 输入 b 得到状态2 在状态s4中
状态3 输入 b 空集 忽略
无法通过字符b找到一个不属于状态s4的状态
对字符c同理,因此可以合并
Clang
中的实现
要解决的问题是: Clang
是如何进行词法分析的?
如何使用lldb
调试Clang
工程
在上面的实验分析,发现终端在执行clang
指令时,最终会传递到CompilerInvocation.cpp
这个文件的ParseFrontendArgs
方法中。
指令执行完成后,进程就结束了,并不像iOS
应用会有运行循环。
因此在CompilerInvocation.cpp
文件中
引入 #include <unistd.h> 头文件
添加延时的操作
case OPT_dump_tokens:
Opts.ProgramAction = frontend::DumpTokens;
sleep(20);
break;
重新
sudo ninja -j4 install
窗口A
执行
## 定位到工程目录中
/opt/llvm/bin/clang-8 -fmodules -E -Xclang -dump-tokens main.mm
窗口B
执行
~ % ps -ef | grep clang
## 会有类似下面的输出
% ps -ef | grep clang-8
501 497 3105 0 11:11AM ttys000 0:00.01 grep --color=auto --exclude-dir=.bzr --exclude-dir=CVS --exclude-dir=.git --exclude-dir=.hg --exclude-dir=.svn clang-8
501 384 13315 0 11:11AM ttys001 0:00.06 /opt/llvm/bin/clang-8 -fmodules -E -Xclang -dump-tokens main.mm
501 392 384 0 11:11AM ttys001 0:00.01 /opt/llvm/bin/clang-8 -cc1 -triple x86_64-apple-macosx10.13.0 -Wdeprecated-objc-isa-usage -Werror=deprecated-objc-isa-usage -E -disable-free -disable-llvm-verifier -discard-value-names -main-file-name main.mm -mrelocation-model pic -pic-level 2 -mthread-model posix -mdisable-fp-elim -masm-verbose -munwind-tables -target-cpu penryn -dwarf-column-info -debugger-tuning=lldb -ggnu-pubnames -target-linker-version 409.12 -resource-dir /opt/llvm/lib/clang/8.0.0 -stdlib=libc++ -internal-isystem /opt/llvm/include/c++/v1 -fdeprecated-macro -fdebug-compilation-dir /Users/Jason/Desktop/TryLLVM/prj/LLVM_02/LLVM_02 -ferror-limit 19 -fmessage-length 158 -stack-protector 1 -fblocks -fencode-extended-block-signature -fmodules -fimplicit-module-maps -fmodules-cache-path=/var/folders/lk/znn1qp4925j412wt4t_qkplc0000gn/C/org.llvm.clang.Jason/ModuleCache -fmodules-validate-system-headers -fregister-global-dtors-with-atexit -fobjc-runtime=macosx-10.13.0 -fobjc-exceptions -fcxx-exceptions -fexceptions -fmax-type-align=16 -fdiagnostics-show-option -fcolor-diagnostics -dump-tokens -o - -x objective-c++ main.mm
窗口C
执行
## 进入lldb 交互环境
~% lldb
## attach到对应的进程上 选带cc1的那个进程
~% process attach --pid 392
## 查看调用栈
~% bt all
查看源代码,经过整理后,大致是这样的
## CompilerInvocation.cpp
CompilerInvocation::CreateFromArgs 调用
ParseFrontendArgs 调用
## FrontendAction.cpp
DumpTokensAction::ExecuteAction
/// Implements the -dump-tokens frontend action: lexes the main source
/// file from start to finish and prints every token — including its
/// flags and source location — to stderr, one per line, stopping after
/// the eof token has been printed.
void DumpTokensAction::ExecuteAction() {
Preprocessor &PP = getCompilerInstance().getPreprocessor();
// Start preprocessing the specified input file.
Token Tok;
PP.EnterMainSourceFile();
do {
PP.Lex(Tok);
// true => also dump the token's flags and location.
PP.DumpToken(Tok, true);
llvm::errs() << "\n";
} while (Tok.isNot(tok::eof));
}
/// Lex the next token into Result. Dispatches to whichever lexer is
/// currently active (raw file lexer, macro-expansion token lexer,
/// caching lexer, or the post-module-import special case) and loops
/// until one of them actually produces a token.
void Preprocessor::Lex(Token &Result) {
// We loop here until a lex function returns a token; this avoids recursion.
bool ReturnedToken;
do {
switch (CurLexerKind) {
case CLK_Lexer:
ReturnedToken = CurLexer->Lex(Result);
break;
case CLK_TokenLexer:
ReturnedToken = CurTokenLexer->Lex(Result);
break;
case CLK_CachingLexer:
// The caching lexer always produces a token.
CachingLex(Result);
ReturnedToken = true;
break;
case CLK_LexAfterModuleImport:
LexAfterModuleImport(Result);
ReturnedToken = true;
break;
}
} while (!ReturnedToken);
if (Result.is(tok::code_completion) && Result.getIdentifierInfo()) {
// Remember the identifier before code completion token.
setCodeCompletionIdentifierInfo(Result.getIdentifierInfo());
setCodeCompletionTokenRange(Result.getLocation(), Result.getEndLoc());
// Set IdentifierInfo to null to avoid confusing code that handles both
// identifiers and completion tokens.
Result.setIdentifierInfo(nullptr);
}
// Track whether the last token was '@' (used for Objective-C lexing).
LastTokenWasAt = Result.is(tok::at);
}
/// Print a single token to stderr in the -dump-tokens format:
/// "<kind> '<spelling>'", optionally followed by flag annotations
/// ([StartOfLine], [LeadingSpace], ...) and the source location when
/// DumpFlags is true.
void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
llvm::errs() << tok::getTokenName(Tok.getKind()) << " '"
<< getSpelling(Tok) << "'";
if (!DumpFlags) return;
llvm::errs() << "\t";
if (Tok.isAtStartOfLine())
llvm::errs() << " [StartOfLine]";
if (Tok.hasLeadingSpace())
llvm::errs() << " [LeadingSpace]";
if (Tok.isExpandDisabled())
llvm::errs() << " [ExpandDisabled]";
// A token "needs cleaning" when its spelling contains trigraphs or
// escaped newlines; show the raw source text in that case.
if (Tok.needsCleaning()) {
const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength())
<< "']";
}
llvm::errs() << "\tLoc=<";
DumpLocation(Tok.getLocation());
llvm::errs() << ">";
}
在CompilerInvocation::CreateFromArgs
方法上也做类似延时的操作,然后设置断点
(lldb) br set --name ParseFrontendArgs
参数Token
的结构如下
namespace clang {
class IdentifierInfo;
/// Token - This structure provides full information about a lexed token.
/// It is not intended to be space efficient, it is intended to return as much
/// information as possible about each returned token. This is expected to be
/// compressed into a smaller form if memory footprint is important.
///
/// The parser can create a special "annotation token" representing a stream of
/// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
/// can be represented by a single typename annotation token that carries
/// information about the SourceRange of the tokens and the type object.
class Token {
/// The location of the token. This is actually a SourceLocation.
unsigned Loc;
// Conceptually these next two fields could be in a union. However, this
// causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
// routine. Keeping as separate members with casts until a more beautiful fix
// presents itself.
/// UintData - This holds either the length of the token text, when
/// a normal token, or the end of the SourceRange when an annotation
/// token.
/// token 的长度
unsigned UintData;
/// PtrData - This is a union of four different pointer types, which depends
/// on what type of token this is:
/// Identifiers, keywords, etc:
/// This is an IdentifierInfo*, which contains the uniqued identifier
/// spelling.
/// Literals: isLiteral() returns true.
/// This is a pointer to the start of the token in a text buffer, which
/// may be dirty (have trigraphs / escaped newlines).
/// Annotations (resolved type names, C++ scopes, etc): isAnnotation().
/// This is a pointer to sema-specific data for the annotation token.
/// Eof:
// This is a pointer to a Decl.
/// Other:
/// This is null.
/// token的内容
void *PtrData;
/// Kind - The actual flavor of token this is.
/// token的类型
tok::TokenKind Kind;
/// Flags - Bits we track about this token, members of the TokenFlags enum.
unsigned short Flags;
public:
...
tok::TokenKind
的声明
TokenKinds.h
namespace tok {
/// Provides a simple uniform namespace for tokens from all C languages.
enum TokenKind : unsigned short {
#define TOK(X) X,
#include "clang/Basic/TokenKinds.def"
NUM_TOKENS
};
/// Provides a namespace for preprocessor keywords which start with a
/// '#' at the beginning of the line.
enum PPKeywordKind {
#define PPKEYWORD(X) pp_##X,
#include "clang/Basic/TokenKinds.def"
NUM_PP_KEYWORDS
};
/// Provides a namespace for Objective-C keywords which start with
/// an '@'.
enum ObjCKeywordKind {
#define OBJC_AT_KEYWORD(X) objc_##X,
#include "clang/Basic/TokenKinds.def"
NUM_OBJC_KEYWORDS
};
// clang/Basic/TokenKinds.def
...
// C99 6.4.2: Identifiers.
TOK(identifier) // abcde123
TOK(raw_identifier) // Used only in raw lexing mode.
// C99 6.4.4.1: Integer Constants
// C99 6.4.4.2: Floating Constants
TOK(numeric_constant) // 0x123
// C99 6.4.4: Character Constants
TOK(char_constant) // 'a'
TOK(wide_char_constant) // L'b'
// C++17 Character Constants
TOK(utf8_char_constant) // u8'a'
// C++11 Character Constants
TOK(utf16_char_constant) // u'a'
TOK(utf32_char_constant) // U'a'
// C99 6.4.5: String Literals.
TOK(string_literal) // "foo"
TOK(wide_string_literal) // L"foo"
TOK(angle_string_literal)// <foo>
// C++11 String Literals.
TOK(utf8_string_literal) // u8"foo"
TOK(utf16_string_literal)// u"foo"
TOK(utf32_string_literal)// U"foo"
// C99 6.4.6: Punctuators.
PUNCTUATOR(l_square, "[")
PUNCTUATOR(r_square, "]")
PUNCTUATOR(l_paren, "(")
PUNCTUATOR(r_paren, ")")
PUNCTUATOR(l_brace, "{")
PUNCTUATOR(r_brace, "}")
PUNCTUATOR(period, ".")
PUNCTUATOR(ellipsis, "...")
PUNCTUATOR(amp, "&")
PUNCTUATOR(ampamp, "&&")
PUNCTUATOR(ampequal, "&=")
PUNCTUATOR(star, "*")
PUNCTUATOR(starequal, "*=")
PUNCTUATOR(plus, "+")
PUNCTUATOR(plusplus, "++")
PUNCTUATOR(plusequal, "+=")
...
/// Lexer.cpp
/// LexTokenInternal - This implements a simple C family lexer. It is an
/// extremely performance critical piece of code. This assumes that the buffer
/// has a null character at the end of the file. This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface. It assumes
/// that the Flags of result have been cleared before calling this.
/// 词法分析的方法
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine)
...
// Read a character, advancing over it.
char Char = getAndAdvanceChar(CurPtr, Result);
tok::TokenKind Kind;
/// 遍历字符
switch (Char) {
...
case '\n':
// If we are inside a preprocessor directive and we see the end of line,
// we know we are done with the directive, so return an EOD token.
if (ParsingPreprocessorDirective) {
// Done parsing the "line".
ParsingPreprocessorDirective = false;
// Restore comment saving mode, in case it was disabled for directive.
if (PP)
resetExtendedTokenMode();
// Since we consumed a newline, we are back at the start of a line.
IsAtStartOfLine = true;
IsAtPhysicalStartOfLine = true;
/// 比如在预处理指令中遇到 \n 就判定为 tok::eod(指令结束)类型
Kind = tok::eod;
break;
}
...
// C99 6.4.4.1: Integer Constants.
// C99 6.4.4.2: Floating Constants.
// 整形常量
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
return LexNumericConstant(Result, CurPtr);
...
// C99 6.4.2: Identifiers. 是不是标识符
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
case 'V': case 'W': case 'X': case 'Y': case 'Z':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
case 'v': case 'w': case 'x': case 'y': case 'z':
case '_':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
return LexIdentifier(Result, CurPtr);
/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed. Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
...
/// 类型设置为 tok::numeric_constant
FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
...
/// 判断是否是标识符
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
unsigned Size;
unsigned char C = *CurPtr++;
...