C/C++文件解析

由于主要研究C和C++,对函数、变量、类型提取等还是会经常用的,但是直接抱着ANTLR4手撸Visitor还是太傻了,所以调研了一下已有的一些工具,康康有没有好的解决方案

主要介绍三个工具,分别是lizard、pycparser和libclang。

lizard#

lizard是一个代码分析的python包,支持的语言包括但不限于c/cpp、java。除了分析源代码文件,它也有一些统计代码量、检测抄袭之类的功能。当然我只关心它的源代码分析部分,它有个比较好的优点就是对源代码的分析不依赖于头文件,所以我们不用管头文件是不是缺失的问题。

代码量统计#

通过命令行即可调用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
$ lizard openssl/ssl/bio_ssl.c 
================================================
NLOC CCN token PARAM length location
------------------------------------------------
4 1 10 1 4 BIO_f_ssl@51-54@openssl/ssl/bio_ssl.c
12 2 63 1 15 ssl_new@56-70@openssl/ssl/bio_ssl.c
17 5 88 1 19 ssl_free@72-90@openssl/ssl/bio_ssl.c
61 16 294 4 69 ssl_read@92-160@openssl/ssl/bio_ssl.c
54 14 273 4 61 ssl_write@162-222@openssl/ssl/bio_ssl.c
157 43 863 4 169 ssl_ctrl@224-392@openssl/ssl/bio_ssl.c
17 2 76 3 18 ssl_callback_ctrl@394-411@openssl/ssl/bio_ssl.c
7 1 39 2 8 ssl_puts@413-420@openssl/ssl/bio_ssl.c
15 4 94 1 18 BIO_new_buffer_ssl_connect@422-439@openssl/ssl/bio_ssl.c
14 4 91 1 17 BIO_new_ssl_connect@441-457@openssl/ssl/bio_ssl.c
17 4 87 2 19 BIO_new_ssl@459-477@openssl/ssl/bio_ssl.c
15 6 111 2 15 BIO_ssl_copy_session_id@479-493@openssl/ssl/bio_ssl.c
11 5 65 1 12 BIO_ssl_shutdown@495-506@openssl/ssl/bio_ssl.c
1 file analyzed.
==============================================================
NLOC Avg.NLOC AvgCCN Avg.token function_cnt file
--------------------------------------------------------------
438 30.8 8.2 165.7 13 openssl/ssl/bio_ssl.c

===========================================================================================================
!!!! Warnings (length > 1000 or cyclomatic_complexity > 15 or nloc > 1000000 or parameter_count > 100) !!!!
================================================
NLOC CCN token PARAM length location
------------------------------------------------
61 16 294 4 69 ssl_read@92-160@openssl/ssl/bio_ssl.c
157 43 863 4 169 ssl_ctrl@224-392@openssl/ssl/bio_ssl.c
==========================================================================================
Total nloc Avg.NLOC AvgCCN Avg.token Fun Cnt Warning cnt Fun Rt nloc Rt
------------------------------------------------------------------------------------------
438 30.8 8.2 165.7 13 2 0.15 0.54

函数相关#

1
2
3
4
>>> import lizard
>>> p = lizard.analyze_file("openssl/ssl/bio_ssl.c")
>>> p.__dict__
{'function_list': [<lizard.FunctionInfo object at 0x7f13f2e5ecf8>, <lizard.FunctionInfo object at 0x7f13f2e5eb00>, <lizard.FunctionInfo object at 0x7f13f2e5eeb8>, <lizard.FunctionInfo object at 0x7f13f2e5e978>, <lizard.FunctionInfo object at 0x7f13f2e5ebe0>, <lizard.FunctionInfo object at 0x7f13f2e5e9e8>, <lizard.FunctionInfo object at 0x7f13f2e5ea20>, <lizard.FunctionInfo object at 0x7f13f2e532e8>, <lizard.FunctionInfo object at 0x7f13f2e690f0>, <lizard.FunctionInfo object at 0x7f13f2e69198>, <lizard.FunctionInfo object at 0x7f13f2e69128>, <lizard.FunctionInfo object at 0x7f13f2e691d0>, <lizard.FunctionInfo object at 0x7f13f2e69160>], 'filename': 'openssl/ssl/bio_ssl.c', 'nloc': 438, 'token_count': 2364}

可以看到主要是保留了函数的信息和一些基本信息,接着看看函数信息

1
2
3
>>> f = p.function_list[0]
>>> dir(f)
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'add_parameter', 'add_to_function_name', 'add_to_long_name', 'cyclomatic_complexity', 'end_line', 'fan_in', 'fan_out', 'filename', 'full_parameters', 'general_fan_out', 'length', 'location', 'long_name', 'name', 'name_in_space', 'nloc', 'parameter_count', 'parameters', 'start_line', 'token_count', 'top_nesting_level', 'unqualified_name']

主要还是一些很基本的信息,示例函数如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
static int ssl_new(BIO *bi)
{
BIO_SSL *bs = OPENSSL_zalloc(sizeof(*bs));

if (bs == NULL) {
ERR_raise(ERR_LIB_BIO, ERR_R_MALLOC_FAILURE);
return 0;
}
BIO_set_init(bi, 0);
BIO_set_data(bi, bs);
/* Clear all flags */
BIO_clear_flags(bi, ~0);

return 1;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
>>> f.start_line
56
>>> f.end_line
70
>>> f.location
' ssl_new@56-70@openssl/ssl/bio_ssl.c'
>>> f.name
'ssl_new'
>>> f.long_name
'ssl_new( BIO * bi)'
>>> f.parameters
['bi']
>>> f.full_parameters
['BIO * bi']

对于函数调用等的分析是没有的,这个作者也在README中说了。

pycparser#

pycparser是一个用python解析C语言的库,可定制性比lizard更高,目前主要兼容C99标准,内置的c_ast模块用于访问AST。缺点是它只为一些标准头文件提供了用于绕过真实头文件的假头文件(fake headers);当缺失其他头文件时,AST解析会失败并报错,需要自己修正,直到生成正确的预处理中间文件。具体可参考 https://eli.thegreenplace.net/2015/on-parsing-c-type-declarations-and-fake-headers 。值得一提的是,即使正确地处理得到了中间文件,parser依然可能会解析失败。

这里我们都使用base64的代码进行体验,里面有for循环也有if语句等。

pycparser主要在AST树上做事,所以写代码的时候可以多参照c_ast.py和它的examples

访问节点示例1#

示例1主要访问函数定义、参数定义、变量定义等。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from __future__ import print_function
import sys

from pycparser import c_parser, c_ast, parse_file
from pycparser import c_generator


def extract_functions(ast):
    """Collect parameters and local declarations of every function in ``ast``.

    Args:
        ast: Root node of a pycparser AST (for example the value returned
            by ``parse_file``).

    Returns:
        A dict keyed by function name. Each value is a dict with:
          * 'position'  -- the definition's ``coord`` rendered as a string,
          * 'params'    -- C source text of each parameter list found in the
                           function's declarator,
          * 'variables' -- C source text of each declaration found in the
                           function body.
    """
    # Local generator and result dict: the function no longer depends on
    # module-level names that only exist when the file is run as a script.
    code_gen = c_generator.CGenerator()
    result = {}

    class DeclVisitor(c_ast.NodeVisitor):
        """Records every ``Decl`` node as C source text."""

        def __init__(self):
            self.found = []

        def visit_Decl(self, node):
            self.found.append(code_gen.visit(node))

    class ParamListVisitor(c_ast.NodeVisitor):
        """Records every ``ParamList`` node as C source text."""

        def __init__(self):
            self.found = []

        def visit_ParamList(self, node):
            self.found.append(code_gen.visit(node))

    class FuncDefVisitor(c_ast.NodeVisitor):
        """Fills ``result`` with one entry per function definition."""

        def visit_FuncDef(self, node):
            # Only walk the declarator: walking the whole definition would
            # also pick up parameter lists of function-pointer declarations
            # inside the body and misreport them as parameters.
            param_v = ParamListVisitor()
            param_v.visit(node.decl)

            variables_v = DeclVisitor()
            variables_v.visit(node.body)

            result[node.decl.name] = {
                'position': "%s" % node.decl.coord,
                'params': param_v.found,
                'variables': variables_v.found,
            }

    FuncDefVisitor().visit(ast)
    return result

if __name__ == "__main__":
    # Nothing to do without a C source path on the command line.
    if len(sys.argv) <= 1:
        exit(0)
    filename = sys.argv[1]

    # Module-level defaults; `functions` is replaced by the extraction
    # result further down.
    functions, params, variables = {}, [], []
    generator = c_generator.CGenerator()

    # Run the file through `gcc -E` with the fake libc include directory on
    # the include path before handing it to the parser.
    preprocessor_args = [
        '-E',
        '-I/root/source_code_process/cparser/utils/fake_libc_include',
    ]
    ast = parse_file(
        filename,
        use_cpp=True,
        cpp_path='gcc',
        cpp_args=preprocessor_args,
    )
    functions = extract_functions(ast)
    print(functions)
# Sample output recorded from one run:
'''
{'base64_encode': {'variables': ['int s', 'unsigned int i', 'unsigned int j', 'unsigned char c', 'unsigned char l'], 'params': ['const unsigned char *in, unsigned int inlen, char *out'], 'position': 'base64/base64.c:73:1'}, 'base64_decode': {'variables': ['unsigned int i', 'unsigned int j', 'unsigned char c'], 'params': ['const char *in, unsigned int inlen, unsigned char *out'], 'position': 'base64/base64.c:122:1'}}
'''

访问节点示例2#

示例2主要想知道函数调用了哪些函数,也就是想生成函数调用图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from __future__ import print_function
import sys

from pycparser import c_parser, c_ast, parse_file
from pycparser import c_generator


def extract_functions(ast):
    """Map every function defined in ``ast`` to the functions it calls.

    Args:
        ast: Root node of a pycparser AST (for example the value returned
            by ``parse_file``).

    Returns:
        A dict keyed by function name. Each value is a dict with:
          * 'position'  -- the definition's ``coord`` rendered as a string,
          * 'funccalls' -- one entry per call found in the definition, in
                           visit order with duplicates kept. Direct calls
                           are recorded by identifier; any other callee
                           expression is recorded as C source text.
    """
    # Local generator and result dict: the function no longer depends on
    # module-level names that only exist when the file is run as a script.
    code_gen = c_generator.CGenerator()
    call_graph = {}

    class FuncCallVisitor(c_ast.NodeVisitor):
        """Records the callee of every ``FuncCall`` node it reaches."""

        def __init__(self):
            self.callees = []

        def visit_FuncCall(self, node):
            callee = node.name
            if isinstance(callee, c_ast.ID):
                self.callees.append(callee.name)
            else:
                # Callee is an expression rather than a bare identifier
                # (e.g. a dereferenced pointer or a struct member), so
                # there is no plain name to read; keep its source text.
                self.callees.append(code_gen.visit(callee))
            # Defining visit_FuncCall stops the automatic descent, so
            # recurse explicitly; otherwise calls nested inside the
            # arguments would be lost.
            self.generic_visit(node)

    class FuncDefVisitor(c_ast.NodeVisitor):
        """Fills ``call_graph`` with one entry per function definition."""

        def visit_FuncDef(self, node):
            call_v = FuncCallVisitor()
            call_v.visit(node)
            call_graph[node.decl.name] = {
                'position': "%s" % node.decl.coord,
                'funccalls': call_v.callees,
            }

    FuncDefVisitor().visit(ast)
    return call_graph



if __name__ == "__main__":
    # Nothing to do without a C source path on the command line.
    if len(sys.argv) <= 1:
        exit(0)
    filename = sys.argv[1]

    # Module-level defaults; `functions` is replaced by the extraction
    # result further down.
    functions, funccalls = {}, []
    generator = c_generator.CGenerator()

    # Run the file through `gcc -E` with the fake libc include directory on
    # the include path before handing it to the parser.
    preprocessor_args = [
        '-E',
        '-I/root/source_code_process/cparser/utils/fake_libc_include',
    ]
    ast = parse_file(
        filename,
        use_cpp=True,
        cpp_path='gcc',
        cpp_args=preprocessor_args,
    )
    functions = extract_functions(ast)
    print(functions)
# Sample output recorded from one run:
'''
{'main': {'position': 'base64/main.c:30:1', 'funccalls': ['test', 'test', 'test', 'test', 'test', 'test', 'test']}, 'test': {'position': 'base64/main.c:9:1', 'funccalls': ['malloc', 'malloc', 'assert', 'assert', 'assert', 'assert', 'assert', 'assert', 'free', 'free']}}
'''

libclang#

libclang需要先装llvm,从一个纯净ubuntu的docker开始,命令如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Install the editor, compilers, CMake and Python tooling used by the steps below.
apt install vim gcc g++ cmake python2 python3 python3-distutils python3-pip
# Unpack the LLVM 11.0.0 sources and rename the tree to ./llvm.
# NOTE: every *.src.tar.xz archive is expected to already be in the current directory.
tar -xvf llvm-11.0.0.src.tar.xz
mv llvm-11.0.0.src llvm
# Unpack clang and place it under llvm/tools/.
tar -xvf clang-11.0.0.src.tar.xz
mv clang-11.0.0.src clang
mv clang llvm/tools/
# Unpack compiler-rt, libcxx and libcxxabi into llvm/projects/.
tar -xvf compiler-rt-11.0.0.src.tar.xz
mv compiler-rt-11.0.0.src llvm/projects/compiler-rt
tar -xvf libcxx-11.0.0.src.tar.xz
mv libcxx-11.0.0.src llvm/projects/libcxx
tar -xvf libcxxabi-11.0.0.src.tar.xz
mv libcxxabi-11.0.0.src llvm/projects/libcxxabi
# Unpack lldb and lld into llvm/tools/.
tar -xvf lldb-11.0.0.src.tar.xz
mv lldb-11.0.0.src llvm/tools/lldb
tar -xvf lld-11.0.0.src.tar.xz
mv lld-11.0.0.src llvm/tools/lld
# Unpack clang-tools-extra into the clang tree (requires the clang move above).
tar -xvf clang-tools-extra-11.0.0.src.tar.xz
mv clang-tools-extra-11.0.0.src llvm/tools/clang/tools/extra
# Configure a Release build in llvm/build with the Unix Makefiles generator,
# then compile with 4 parallel jobs.
cd llvm
mkdir build && cd build
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release ..
make -j4
# Install the `clang` package, pinned to 11.0, with pip3
# (presumably the libclang Python bindings; confirm the version matches the build).
pip3 install clang==11.0

算了,先空着吧,等用到了再回来填坑

参考资料#

评论