前言
这周公司新上的项目需要压测,根据各个压测场景,需要拿到linux服务器不同的系统消耗指标。
思来想去觉得还是使用python更轻量,也更容易被后续的第三方agent来执行,就写了这样的一个指标采集工具。
指标采集
指标包括cpu、内存、io、网卡等一系列常见的性能指标,具体的指标以及计算也可以参考github上的淘宝开源项目tsar
整体的采集思路非常简单,分为两种:
- 读取特定的文件,解析文件,格式化数据;
- 执行指定命令,获取输出,格式化数据
所有的指标都乘以了一个系数,我贪快,所以全都直接写的10000 :(
具体的数据解析可以自行cat输出对应的文件,结合命令输出来对比
1.负载
从/proc/loadavg文件中读取
def collector_load(path="/proc/loadavg"):
    """Return the 1-, 5- and 15-minute load averages scaled by 10000.

    Args:
        path: File to parse. Defaults to ``/proc/loadavg``; another path can
            be supplied to parse a saved copy.

    Returns:
        A dict with the integer keys ``load1``, ``load5`` and ``load15``.

    Raises:
        IOError: If the file cannot be opened (``OSError`` on Python 3).
        IndexError: If the file holds fewer than three fields.
        ValueError: If one of the first three fields is not a number.
    """
    # The context manager closes the file even when reading fails.
    with open(path) as load_file:
        fields = load_file.read().split()

    def scaled(text):
        # Round before converting: truncation alone turns values such as
        # 0.57 * 10000 (just below 5700 in binary floating point) into 5699.
        return int(round(float(text) * 10000))

    return {
        "load1": scaled(fields[0]),
        "load5": scaled(fields[1]),
        "load15": scaled(fields[2]),
    }
2. 内存
从/proc/meminfo中读取
# Collect memory information.
def collect_memory_info(path="/proc/meminfo"):
    """Return memory utilisation ratios scaled by 10000.

    Buffers and page cache are counted as free memory, so ``mem_util`` is
    ``(MemTotal - MemFree - Buffers - Cached) / MemTotal``.

    Args:
        path: File to parse. Defaults to ``/proc/meminfo``.

    Returns:
        A dict with the integer keys ``mem_buff``, ``mem_util`` and
        ``mem_cache``, each a fraction of ``MemTotal`` multiplied by 10000
        and truncated.

    Raises:
        KeyError: If MemTotal, MemFree, Buffers or Cached is missing.
        ZeroDivisionError: If MemTotal is reported as 0.
    """
    wanted = ("MemTotal", "MemFree", "Buffers", "Cached")
    values = {}
    with open(path) as mem_file:
        for line in mem_file:
            # Split once into "<name>:<value> [unit]"; skip anything else.
            name, colon, rest = line.partition(":")
            if not colon or name not in wanted:
                continue
            values[name] = int(rest.split()[0])

    mem_total = values["MemTotal"]
    buffers = values["Buffers"]
    cached = values["Cached"]
    mem_free = values["MemFree"] + buffers + cached
    total = float(mem_total)
    return {
        "mem_buff": int(buffers / total * 10000),
        "mem_util": int((mem_total - mem_free) / total * 10000),
        "mem_cache": int(cached / total * 10000),
    }
3. cpu信息
从/proc/stat中获取
# Collect CPU information.
def collect_cpu_info(path="/proc/stat"):
    """Read the cumulative tick counters from the aggregate ``cpu`` line.

    Args:
        path: File to parse. Defaults to ``/proc/stat``.

    Returns:
        A dict with the keys ``User``, ``Sys``, ``Idle``, ``Steal``, ``Wait``
        and ``Total`` (the sum of every numeric column on the line), or an
        empty dict when the file has no ``cpu`` line.

    Raises:
        ValueError: If a column on the ``cpu`` line is not an integer.
    """
    with open(path) as cpu_file:
        for line in cpu_file:
            fields = line.split()
            # Only the summary line is wanted, not the per-core "cpuN" lines.
            if not fields or fields[0] != "cpu":
                continue
            ticks = [int(value) for value in fields[1:]]
            # Pad to eight columns so a line lacking the later counters
            # (e.g. steal on older kernels) reports 0 instead of raising
            # IndexError.
            padded = ticks + [0] * (8 - len(ticks))
            return {
                "User": padded[0],
                "Sys": padded[2],
                "Idle": padded[3],
                "Steal": padded[7],
                "Wait": padded[4],
                "Total": sum(ticks),
            }
    return {}
这个指标在系统中是累加的,因此需要再次进行计算,即本次结果与上次结果的差值才是本段时间内的指标值:
# Compute CPU usage for the interval since the previous call.
def calculate_cpu_info():
    """Turn the cumulative CPU counters into per-interval ratios.

    The counters returned by ``collect_cpu_info`` only ever grow, so each
    call is compared with the sample stored in the module-level
    ``last_cpu_info`` (expected to be ``None`` before the first sample).
    The stored sample is then replaced by the new one.

    Returns:
        A dict with ``cpu_user``, ``cpu_sys``, ``cpu_wait``, ``cpu_steal``,
        ``cpu_idle`` and ``cpu_util``, each a fraction of the elapsed ticks
        multiplied by 10000 and truncated. An empty dict is returned on the
        first call, when no sample could be collected, or when no ticks
        elapsed between the two samples.
    """
    global last_cpu_info
    cpu_info = collect_cpu_info()
    if not cpu_info:
        # Nothing was parsed; keep the previous baseline untouched.
        return {}

    previous = last_cpu_info
    last_cpu_info = cpu_info
    if not previous:
        return {}

    names = ("Total", "User", "Sys", "Idle", "Wait", "Steal")
    delta = {name: cpu_info[name] - previous[name] for name in names}
    total = float(delta["Total"])
    if total <= 0:
        # Two samples taken within the same tick would otherwise raise
        # ZeroDivisionError.
        return {}

    def ratio(ticks):
        return int(ticks / total * 10000)

    busy = delta["Total"] - delta["Idle"] - delta["Wait"] - delta["Steal"]
    return {
        "cpu_user": ratio(delta["User"]),
        "cpu_sys": ratio(delta["Sys"]),
        "cpu_wait": ratio(delta["Wait"]),
        "cpu_steal": ratio(delta["Steal"]),
        "cpu_idle": ratio(delta["Idle"]),
        "cpu_util": ratio(busy),
    }
4. IO相关
从文件/proc/diskstats中读取
# Collect block-device I/O counters.
def collect_io_info(path="/proc/diskstats"):
    """Read cumulative I/O counters for the devices of interest.

    Devices that have never completed a read, and devices rejected by
    ``should_handle_device``, are left out.

    Args:
        path: File to parse. Defaults to ``/proc/diskstats``.

    Returns:
        A dict keyed by device name. Each value holds ``ReadRequest``,
        ``WriteRequest``, ``MsecRead``, ``MsecWrite``, ``MsecTotal`` and
        ``Timestamp`` (whole seconds since the epoch at sampling time).

    Raises:
        ValueError: If a counter column is not an integer.
    """
    io_buffer = {}
    # One timestamp for the whole sample: every device is read in one pass.
    sampled_at = int(time.time())
    with open(path) as io_file:
        for line in io_file:
            fields = line.split()
            # Column 12 (time spent doing I/O) is the last one used; skip
            # blank or truncated lines rather than raising IndexError.
            if len(fields) < 13:
                continue
            device_name = fields[2]
            if fields[3] == "0":
                continue
            if not should_handle_device(device_name):
                continue
            io_buffer[device_name] = {
                "ReadRequest": int(fields[3]),
                "WriteRequest": int(fields[7]),
                "MsecRead": int(fields[6]),
                "MsecWrite": int(fields[10]),
                "MsecTotal": int(fields[12]),
                "Timestamp": sampled_at,
            }
    return io_buffer
# Decide whether a block device should be sampled.
def should_handle_device(device):
    """Return True when ``device`` is one of the disks to collect.

    Accepted names are three-character ``sdX``/``vdX`` whole disks, and names
    of four or more characters starting with ``xvd`` or ``sda`` (the naming
    the original code labels "aws").

    The prefix alternatives are grouped explicitly. Without grouping, ``and``
    binds tighter than ``or``, so the length check never applied to the
    second prefix and partitions such as ``vda1`` were collected alongside
    ``vda``.

    Args:
        device: Device name as it appears in ``/proc/diskstats``.

    Returns:
        bool: Whether the device should be collected.
    """
    whole_disk = len(device) == 3 and device.startswith(("sd", "vd"))
    aws_disk = len(device) >= 4 and device.startswith(("xvd", "sda"))
    return whole_disk or aws_disk
这个指标也是累加的,需要进行求差:
# Compute I/O metrics for the interval since the previous call.
def calculate_io_info():
    """Turn the cumulative disk counters into per-interval metrics.

    Each call compares the sample from ``collect_io_info`` with the one
    stored in the module-level ``last_io_info`` (expected to be ``None``
    before the first sample), then stores the new sample.

    Returns:
        A list with one dict per device present in both samples, in the
        iteration order of the current sample. Each dict holds:

        * ``io_rs`` / ``io_ws``: read / write requests per second x 10000
        * ``io_await``: milliseconds spent per request x 10000
        * ``io_util``: fraction of the interval spent doing I/O x 10000

        The list is empty on the first call.
    """
    global last_io_info
    io_info = collect_io_info()
    previous = last_io_info
    last_io_info = io_info

    result = []
    if previous is None:
        return result

    for device, current in io_info.items():
        before = previous.get(device)
        if before is None:
            # Device appeared since the last sample: no baseline yet.
            continue
        duration = current["Timestamp"] - before["Timestamp"]
        if duration <= 0:
            # Timestamps are whole seconds, so two quick calls can share one;
            # skip instead of dividing by zero.
            continue

        reads = current["ReadRequest"] - before["ReadRequest"]
        writes = current["WriteRequest"] - before["WriteRequest"]
        read_ms = current["MsecRead"] - before["MsecRead"]
        write_ms = current["MsecWrite"] - before["MsecWrite"]
        busy_ms = current["MsecTotal"] - before["MsecTotal"]

        requests = reads + writes
        io_await = 0
        if requests > 0:
            io_await = int(float(read_ms + write_ms) / requests * 10000)

        result.append({
            # float() first: integer division on Python 2 would drop the
            # fractional requests-per-second before scaling.
            "io_rs": int(float(reads) / duration * 10000),
            "io_ws": int(float(writes) / duration * 10000),
            "io_await": io_await,
            "io_util": int(float(busy_ms) / (duration * 1000) * 10000),
        })
    return result
5. 采集网卡
网卡数据从/proc/net/dev中读取
# Collect network interface traffic counters.
def collect_net_info(path="/proc/net/dev"):
    """Read cumulative traffic counters for the interfaces of interest.

    Only interfaces accepted by ``should_collect_card`` are included.

    Args:
        path: File to parse. Defaults to ``/proc/net/dev``.

    Returns:
        A dict keyed by interface name. Each value holds ``InBytes``,
        ``InPackets``, ``InErrors``, ``InDrops``, ``OutBytes``,
        ``OutPackets``, ``OutErrors`` and ``OutDrops``.

    Raises:
        ValueError: If a counter column is not an integer.
    """
    # Counter name -> column index after the "<interface>:" prefix.
    columns = (
        ("InBytes", 0),
        ("InPackets", 1),
        ("InErrors", 2),
        ("InDrops", 3),
        ("OutBytes", 8),
        ("OutPackets", 9),
        ("OutErrors", 10),
        ("OutDrops", 11),
    )
    net_buffer = {}
    with open(path) as net_file:
        for line in net_file:
            # The header lines contain no colon and are skipped here.
            name, colon, stats = line.partition(":")
            if not colon:
                continue
            card_name = name.strip()
            if not should_collect_card(card_name):
                continue
            fields = stats.split()
            if len(fields) < 12:
                # Truncated line: skip rather than raise IndexError.
                continue
            net_buffer[card_name] = {
                key: int(fields[index]) for key, index in columns
            }
    return net_buffer
# Decide whether a network interface should be sampled.
def should_collect_card(line):
    """Tell whether an interface name carries one of the collected prefixes.

    Args:
        line: Interface name to check.

    Returns:
        bool: True when the name begins with ``eth`` or ``em``.
    """
    collected_prefixes = ("eth", "em")
    return line.startswith(collected_prefixes)
网卡指标也是一个累加值,需要求差:
# Compute network metrics for the interval since the previous call.
def calculate_net_info():
    """Turn the cumulative interface counters into per-interval deltas.

    Each call compares the sample from ``collect_net_info`` with the one
    stored in the module-level ``last_net_info`` (expected to be ``None``
    before the first sample), then stores the new sample.

    Returns:
        A list with one dict per interface present in both samples. Each
        dict holds ``in_bytes``, ``in_packets``, ``in_errors``, ``in_drops``,
        ``out_bytes``, ``out_packets``, ``out_errors`` and ``out_drops``:
        the counter increase since the previous sample multiplied by 10000.
        The list is empty on the first call.
    """
    # Output key -> counter name in the collected sample.
    metrics = (
        ("in_bytes", "InBytes"),
        ("in_packets", "InPackets"),
        ("in_errors", "InErrors"),
        ("in_drops", "InDrops"),
        ("out_bytes", "OutBytes"),
        ("out_packets", "OutPackets"),
        ("out_errors", "OutErrors"),
        ("out_drops", "OutDrops"),
    )

    global last_net_info
    net_info = collect_net_info()
    previous = last_net_info
    last_net_info = net_info

    result = []
    if previous is None:
        return result

    for card, current in net_info.items():
        before = previous.get(card)
        if before is None:
            # Interface appeared since the last sample: no baseline yet.
            continue
        result.append({
            out_key: (current[counter] - before[counter]) * 10000
            for out_key, counter in metrics
        })
    return result
6. 采集tcp指标
tcp与udp的指标信息都可以从/proc/net/snmp中读取
# Collect TCP counters.
def collect_tcp_info(path="/proc/net/snmp"):
    """Read the TCP counters of interest from the SNMP statistics file.

    The file lists each protocol on two lines: column names, then values.
    The names from the first ``Tcp:`` line are paired with the values from
    the second, so the result does not depend on fixed column positions.

    Args:
        path: File to parse. Defaults to ``/proc/net/snmp``.

    Returns:
        A dict with ``ActiveOpens``, ``PassiveOpens``, ``InSegs``,
        ``OutSegs``, ``RetransSegs`` and ``CurrEstab``, or an empty dict
        when the file has no complete ``Tcp`` section.

    Raises:
        KeyError: If the ``Tcp`` header line lacks one of those columns.
        ValueError: If one of the values is not an integer.
    """
    wanted = (
        "ActiveOpens",
        "PassiveOpens",
        "InSegs",
        "OutSegs",
        "RetransSegs",
        "CurrEstab",
    )
    header = None
    with open(path) as tcp_file:
        for line in tcp_file:
            protocol, colon, rest = line.partition(":")
            if not colon or protocol.strip() != "Tcp":
                continue
            if header is None:
                # First "Tcp:" line carries the column names.
                header = rest.split()
                continue
            values = dict(zip(header, rest.split()))
            return {name: int(values[name]) for name in wanted}
    return {}
里面有累加值也有实时值,当前的连接数为实时值:
# Compute TCP metrics for the interval since the previous call.
def calculate_tcp_info():
    """Turn the TCP counters into per-interval metrics.

    Each call compares the sample from ``collect_tcp_info`` with the one
    stored in the module-level ``last_tcp_info`` (expected to be ``None``
    before the first sample), then stores the new sample. ``CurrEstab`` is
    reported as is rather than as a difference.

    Returns:
        A dict with ``tcp_active``, ``tcp_passive``, ``tcp_inseg`` and
        ``tcp_outseg`` (counter increase x 10000), ``tcp_established``
        (current value x 10000) and ``tcp_retran`` (retransmitted segments
        divided by sent segments, x 10000). An empty dict is returned on
        the first call or when no sample could be collected.
    """
    global last_tcp_info
    tcp_info = collect_tcp_info()
    if not tcp_info:
        # Nothing was parsed; keep the previous baseline untouched.
        return {}

    previous = last_tcp_info
    last_tcp_info = tcp_info
    if not previous:
        return {}

    out_segs = tcp_info["OutSegs"] - previous["OutSegs"]
    retrans_segs = tcp_info["RetransSegs"] - previous["RetransSegs"]
    # An interval in which nothing was sent would otherwise raise
    # ZeroDivisionError; report a retransmission rate of 0 instead.
    retrans_rate = float(retrans_segs) / out_segs if out_segs > 0 else 0.0

    return {
        "tcp_active": (tcp_info["ActiveOpens"] - previous["ActiveOpens"]) * 10000,
        "tcp_passive": (tcp_info["PassiveOpens"] - previous["PassiveOpens"]) * 10000,
        "tcp_inseg": (tcp_info["InSegs"] - previous["InSegs"]) * 10000,
        "tcp_outseg": out_segs * 10000,
        "tcp_established": tcp_info["CurrEstab"] * 10000,
        "tcp_retran": int(retrans_rate * 10000),
    }
7. 采集指定进程的cpu与内存
有两种方式:其一是执行ps命令,取到的是当前进程启动之后的平均cpu与内存占用;其二是从/proc/<pid>目录下读取。这里用的是第一种。
指定的进程名称(多个名称用逗号分隔)通过 ps auxc | grep '进程名1\|进程名2\|....' 来过滤出对应进程的输出行,其中包含进程id以及cpu、内存占用
# Collect CPU and memory usage of the configured processes.
def collect_process_info():
    """Return CPU and memory usage of the processes named in ``processes``.

    ``processes`` is a module-level, comma-separated string of process
    names. The names become a grep alternation applied to the output of
    ``ps auxc``, and every matching line contributes one entry.

    Returns:
        A dict keyed by the eleventh column of the ``ps`` line (the command
        name). Each value holds ``process_cpu_util`` and
        ``process_mem_util``: the third and fourth columns multiplied by
        10000. Several processes sharing a name overwrite one another. The
        dict is empty when ``processes`` is empty or nothing matched.
    """
    process_info = {}
    if processes == "":
        return process_info

    # ``commands`` only exists on Python 2; ``subprocess`` offers the same
    # helper on Python 3.
    try:
        from commands import getstatusoutput
    except ImportError:
        from subprocess import getstatusoutput

    # "\\|" is the basic-regex alternation operator, written with an escaped
    # backslash so that it is a valid string literal.
    # NOTE(review): ``processes`` is interpolated into a shell command, so it
    # must come from trusted configuration only.
    process_filter = "'" + processes.replace(",", "\\|") + "'"
    status_code, output = getstatusoutput("ps auxc | grep " + process_filter)
    if status_code != 0:
        # grep exits non-zero when nothing matched.
        return process_info

    for item in output.split("\n"):
        item_fields = item.split()
        if len(item_fields) < 11:
            continue
        try:
            # Round before converting so float error cannot drop a unit.
            cpu_util = int(round(float(item_fields[2]) * 10000))
            mem_util = int(round(float(item_fields[3]) * 10000))
        except ValueError:
            # Not a process line (for instance the header matched the
            # pattern); ignore it.
            continue
        process_info[item_fields[10]] = {
            "process_cpu_util": cpu_util,
            "process_mem_util": mem_util,
        }
    return process_info
如果需要实时的数据,应该从/proc/<pid>目录下的文件中读取,获取pid的方式和上述方式是一样的