1. 前言
在近期的科研过程中,需要利用objdump对二进制程序进行静态分析,提取该可执行程序中调用的C标准库函数名,并得到各函数名对应的索引,供后续的动态分析使用。编码过程中用到了正则表达式。
2. 正则表达式
正则表达式是在字符串处理中的常用工具,主要用于字符串的匹配。
1)例子
const string FUNC_LINE("<([a-z]+)@"); // pattern text: '<', a captured run of lowercase letters, then '@'
regex reg(FUNC_LINE); // compile the pattern text into a regex object
smatch f; // receives the match results
regex_search(line, f, reg); // look for a matching substring in `line` (declared outside this snippet)
string s = f[1].str(); // text captured by the first group, i.e. the ([a-z]+) part
代码中的"<([a-z]+)@"匹配这样的子串:以<开头,接着是一个或多个小写字母([a-z]+),最后以@结尾。smatch对象f可以按下标访问匹配结果:f[0]是整个模式"<([a-z]+)@"匹配到的子串;f[1]是第一个圆括号捕获组([a-z]+)匹配到的子串。
例如:f[0]匹配得到"<printf@",则f[1]匹配得到"printf"。
2)regex_match函数
match是全文匹配,即要求整个字符串符合匹配规则。匹配成功返回true。
cout << regex_match("<<printf@", regex("<([a-z]+)@")) << endl; // prints 0: the extra leading '<' keeps the whole string from matching
cout << regex_match("<printf@", regex("<([a-z]+)@")) << endl; // prints 1: the whole string matches the pattern
3)regex_search函数
search是搜索匹配,即只要字符串中存在符合规则的子字符串即可。匹配成功返回true。
cout << regex_match("<<printf@", regex("<([a-z]+)@")) << endl; // regex_match prints 0: the whole string does not match
cout << regex_search("<<printf@", regex("<([a-z]+)@")) << endl; // regex_search prints 1: the substring "<printf@" matches
3. 实例
/*
* author: kanxiao
* date: 2020/06/16
* 利用objdump静态分析可执行文件中调用的C标准库函数
* 在libc_desc中的索引存入set
*
*/
#include <iostream>
#include <fstream>
#include <string>
#include <regex>
#include <unordered_map>
#include <set>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>
using namespace std;
/* Hash table mapping each tracked C library function name to its index. */
const std::unordered_map<std::string, unsigned> libc_map{
    /* character classification and case conversion */
    {"isalnum", 0},
    {"isalpha", 1},
    {"iscntrl", 2},
    {"isdigit", 3},
    {"isgraph", 4},
    {"islower", 5},
    {"isprint", 6},
    {"ispunct", 7},
    {"isspace", 8},
    {"isupper", 9},
    {"isxdigit", 10},
    {"isctype", 11},
    {"tolower", 12},
    {"toupper", 13},
    /* memory and string handling */
    {"memcpy", 14},
    {"memset", 15},
    {"memchr", 16},
    {"memcmp", 17},
    {"strcat", 18},
    {"strncat", 19},
    {"strchr", 20},
    {"strcmp", 21},
    {"strncmp", 22},
    {"strcoll", 23},
    {"strcpy", 24},
    {"strncpy", 25},
    {"strcspn", 26},
    {"strlen", 27},
    {"strpbrk", 28},
    {"strspn", 29},
    {"strstr", 30},
    {"strtok", 31},
    {"strxfrm", 32},
    /* general utilities */
    {"atoi", 33},
    {"calloc", 34},
    {"realloc", 35},
    {"malloc", 36},
    {"abort", 37},
    {"getenv", 38},
    {"qsort", 39},
    {"bsearch", 40},
    {"abs", 41},
    {"labs", 42},
    {"rand", 43},
    {"srand", 44},
    {"free", 45},
    /* date and time */
    {"asctime", 46},
    {"clock", 47},
    {"time", 48},
    {"ctime", 49},
    {"gmtime", 50},
    {"localtime", 51},
    {"mktime", 52},
    {"strftime", 53},
    /* stream I/O */
    {"fclose", 54},
    {"fopen", 55},
    {"printf", 56},
    {"fread", 57},
    {"fwrite", 58},
    {"puts", 59},
    {"gets", 60},
    {"fputs", 61},
    {"fgets", 62},
};
// Pattern text: '<', a captured run of lowercase letters, then '@'.
const std::string FUNC_LINE = "<([a-z]+)@";
// Ordered, duplicate-free collection of the libc_map indices found so far.
std::set<unsigned> libc_idx{};
/**
 * Disassembles the executable named by argv[1] with `objdump -d`, scans the
 * listing for "<name@" references, and prints the libc_map indices of every
 * referenced name that appears in libc_map.
 *
 * A child process runs objdump with its standard output redirected to
 * "objdump.txt" in the current directory; the parent waits for it and then
 * parses that file.
 *
 * @param argc argument count; must be at least 2.
 * @param argv argv[1] is the path of the executable to analyse.
 * @return 0 on success, 1 on a usage error or when any step fails.
 */
int main(int argc, char *argv[])
{
    // Without a target path objdump would receive a null argument.
    if(argc < 2)
    {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "prog")
                  << " <executable>" << std::endl;
        return 1;
    }

    const pid_t rc = fork();
    if(rc < 0)
    {
        std::cerr << "fork failed!" << std::endl;
        return 1;
    }

    // Child process: redirect stdout to objdump.txt, then become objdump.
    if(rc == 0)
    {
        // O_CREAT so the first run works when objdump.txt does not exist yet.
        const int fd = open("objdump.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if(fd < 0)
        {
            std::cerr << "open failed!" << std::endl;
            _exit(1);
        }
        // dup2 makes the redirection explicit instead of relying on open()
        // returning descriptor 1 after a close(1).
        if(dup2(fd, STDOUT_FILENO) < 0)
        {
            std::cerr << "dup2 failed!" << std::endl;
            _exit(1);
        }
        if(fd != STDOUT_FILENO)
            close(fd);

        // execvp takes non-const char pointers, so keep the fixed arguments
        // in writable arrays rather than pointing at string literals.
        char prog[] = "objdump";
        char flag[] = "-d";
        char *myargs[] = {prog, flag, argv[1], nullptr};
        execvp(myargs[0], myargs);

        // execvp only returns on failure.
        std::cerr << "execvp failed" << std::endl;
        _exit(1);
    }

    // Parent process: wait for objdump and refuse to parse a listing that
    // was not produced successfully (it could be stale or truncated).
    int status = 0;
    if(waitpid(rc, &status, 0) < 0 || !WIFEXITED(status) || WEXITSTATUS(status) != 0)
    {
        std::cerr << "objdump failed" << std::endl;
        return 1;
    }

    std::ifstream dump("./objdump.txt");
    if(!dump)
    {
        std::cerr << "could not open objdump file" << std::endl;
        return 1;
    }

    // Collect the index of every known function name referenced in the listing.
    const std::regex r(FUNC_LINE);
    std::string line;
    std::smatch m;
    while(std::getline(dump, line))
    {
        if(!std::regex_search(line, m, r))
            continue;
        // m[1] is the first capture group of the pattern.
        const auto it = libc_map.find(m[1].str());
        if(it != libc_map.end())
            libc_idx.insert(it->second);
    }

    // Print the indices as a comma-separated list on one line.
    bool first = true;
    for(const unsigned idx : libc_idx)
    {
        if(!first)
            std::cout << ", ";
        std::cout << idx;
        first = false;
    }
    std::cout << '\n';
    return 0;
}
上述实例完成了前言部分的描述。运行结果: