作者:Maxwell Li
日期:2017/01/20
未经作者允许,禁止转载本文任何内容。如需转载请留言。
问题发现
近段时间 Compass4NFV 物理部署 OpenStack Newton Ubuntu 极其不稳定,各种常见的不常见的概率性问题频发,提高代码稳定性与部署成功率刻不容缓。
1月20日,发现 aodh 在进行数据库同步时失败,而且仅仅出现在 huawei-pod2 和 intel-pod8 上,huawei-pod1 上的物理部署一切顺利。另外,在慕尼黑的 huawei-pod5 部署 OpenStack Newton CentOS 也没有出现过这个问题。
JIRA单请见:COMPASS-521
问题定位
Step1 查看 ERROR log
进入控制节点,手动初始化 aodh 数据库,ERROR 如下:
root@host1:~# su -s /bin/sh -c "aodh-dbsync" aodh
2017-01-19 22:07:57.990 4179 DEBUG aodh.storage [-] looking for 'mysql' driver in 'aodh.storage' get_connection_from_config /usr/lib/python2.7/dist-packages/aodh/storage/__init__.py:68
2017-01-19 22:07:58.078 4179 DEBUG oslo_db.sqlalchemy.engines [-] MySQL server mode set to STRICT_TRANS_TABLES,STRICT_ALL_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,TRADITIONAL,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION _check_effective_sql_mode /usr/lib/python2.7/dist-packages/oslo_db/sqlalchemy/engines.py:261
2017-01-19 22:07:58.081 4179 INFO alembic.runtime.migration [-] Context impl MySQLImpl.
2017-01-19 22:07:58.082 4179 INFO alembic.runtime.migration [-] Will assume non-transactional DDL.
CRITI [aodh] DBError: (pymysql.err.InternalError) (1135, u'Can\'t create a new thread (errno 11 "Resource temporarily unavailable"); if you are not out of available memory, you can consult the manual for a possible OS-dependent bug')
Traceback (most recent call last):
File "/usr/bin/aodh-dbsync", line 10, in <module>
sys.exit(dbsync())
File "/usr/lib/python2.7/dist-packages/aodh/cmd/storage.py", line 29, in dbsync
storage.get_connection_from_config(conf).upgrade()
File "/usr/lib/python2.7/dist-packages/aodh/storage/impl_sqlalchemy.py", line 108, in upgrade
command.upgrade(cfg, "head")
File "/usr/lib/python2.7/dist-packages/alembic/command.py", line 174, in upgrade
script.run_env()
File "/usr/lib/python2.7/dist-packages/alembic/script/base.py", line 407, in run_env
util.load_python_file(self.dir, 'env.py')
File "/usr/lib/python2.7/dist-packages/alembic/util/pyfiles.py", line 93, in load_python_file
module = load_module_py(module_id, path)
File "/usr/lib/python2.7/dist-packages/alembic/util/compat.py", line 79, in load_module_py
mod = imp.load_source(module_id, path, fp)
File "/usr/lib/python2.7/dist-packages/aodh/storage/sqlalchemy/alembic/env.py", line 92, in <module>
run_migrations_online()
File "/usr/lib/python2.7/dist-packages/aodh/storage/sqlalchemy/alembic/env.py", line 75, in run_migrations_online
with connectable.connect() as connection:
File "/usr/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2018, in connect
return self._connection_cls(self, **kwargs)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 72, in __init__
if connection is not None else engine.raw_connection()
File "/usr/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2104, in raw_connection
self.pool.unique_connection, _connection)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2078, in _wrap_pool_connect
e, dialect, self)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 1401, in _handle_dbapi_exception_noconnection
util.raise_from_cause(newraise, exc_info)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/util/compat.py", line 200, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2074, in _wrap_pool_connect
return fn()
File "/usr/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 318, in unique_connection
return _ConnectionFairy._checkout(self)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 713, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 480, in checkout
rec = pool._do_get()
File "/usr/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 1060, in _do_get
self._dec_overflow()
File "/usr/lib/python2.7/dist-packages/sqlalchemy/util/langhelpers.py", line 60, in __exit__
compat.reraise(exc_type, exc_value, exc_tb)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 1057, in _do_get
return self._create_connection()
File "/usr/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 323, in _create_connection
return _ConnectionRecord(self)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 449, in __init__
self.connection = self.__connect()
File "/usr/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 607, in __connect
connection = self.__pool._invoke_creator(self)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/engine/strategies.py", line 97, in connect
return dialect.connect(*cargs, **cparams)
File "/usr/lib/python2.7/dist-packages/sqlalchemy/engine/default.py", line 385, in connect
return self.dbapi.connect(*cargs, **cparams)
File "/usr/lib/python2.7/dist-packages/pymysql/__init__.py", line 88, in Connect
return Connection(*args, **kwargs)
File "/usr/lib/python2.7/dist-packages/pymysql/connections.py", line 679, in __init__
self.connect()
File "/usr/lib/python2.7/dist-packages/pymysql/connections.py", line 890, in connect
self._get_server_information()
File "/usr/lib/python2.7/dist-packages/pymysql/connections.py", line 1190, in _get_server_information
packet = self._read_packet()
File "/usr/lib/python2.7/dist-packages/pymysql/connections.py", line 966, in _read_packet
packet.check_error()
File "/usr/lib/python2.7/dist-packages/pymysql/connections.py", line 394, in check_error
err.raise_mysql_exception(self._data)
File "/usr/lib/python2.7/dist-packages/pymysql/err.py", line 120, in raise_mysql_exception
_check_mysql_exception(errinfo)
File "/usr/lib/python2.7/dist-packages/pymysql/err.py", line 115, in _check_mysql_exception
raise InternalError(errno, errorvalue)
DBError: (pymysql.err.InternalError) (1135, u'Can\'t create a new thread (errno 11 "Resource temporarily unavailable"); if you are not out of available memory, you can consult the manual for a possible OS-dependent bug')
使用 root 用户直接执行 aodh-dbsync 脚本,发现问题依然存在,初步排除数据库权限问题。
在 ERROR log 中发现 “Resource temporarily unavailable”,怀疑这是引起 aodh 数据库初始化失败的主要原因。在 /var/log/nova,/var/log/cinder,/var/log/heat 等目录下搜索 “Resource temporarily unavailable”,均在对应的 api.log 中发现。
Step2 验证 heat 服务
进入 /var/log/heat,发现在 heat-api.log 和 heat-engine.log 文件下都存在 “Resource temporarily unavailable” ERROR,但是 heat-api 和 heat-engine 服务均正常。
尝试直接登入 heat 数据库:
root@host3:/var/log/heat# mysql -u heat -pheat_db_secret -D heat
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A
Welcome to the MariaDB monitor. Commands end with ; or \g.
Your MariaDB connection id is 825749
Server version: 10.0.26-MariaDB-1~xenial-wsrep mariadb.org binary distribution, wsrep_25.13.raf7f02e
Copyright (c) 2000, 2016, Oracle, MariaDB Corporation Ab and others.
Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
MariaDB [heat]> exit
Bye
发现能够进入。
尝试验证 heat 服务:
root@host3:/var/log/heat# . /opt/admin-openrc.sh
root@host3:/var/log/heat# openstack orchestration service list
WARNING: openstackclient.common.utils is deprecated and will be removed after Jun 2017. Please use osc_lib.utils
+----------+-------------+------------------------+-------+--------+------------------------+--------+
| hostname | binary | engine_id | host | topic | updated_at | status |
+----------+-------------+------------------------+-------+--------+------------------------+--------+
| host1 | heat-engine | 6c20d3ab-b733-413c-b50 | host1 | engine | 2017-01-20T07:23:56.00 | down |
| | | 9-210170055236 | | | 0000 | |
| host3 | heat-engine | 7708e359-83b8-4087 | host3 | engine | 2017-01-20T07:32:23.00 | up |
| | | -973f-3f396734f5ce | | | 0000 | |
| host3 | heat-engine | a355d8cb-b1a4-4afb- | host3 | engine | 2017-01-20T07:32:45.00 | up |
| | | 99e8-35dbadd75241 | | | 0000 | |
| host1 | heat-engine | 8902b38d-3267-46b8 | host1 | engine | 2017-01-20T07:32:22.00 | up |
| | | -908f-d21f4ed5e0ee | | | 0000 | |
......
heat 服务正常。说明 “Resource temporarily unavailable” ERROR 应该是间断性地,或者对手动输入的命令没有影响。由此猜测该问题与数据库性能相关。
Step3 查看进程
root@host3:~# netstat -natp | grep mysql | wc -l
411
发现有 411 个进程在访问数据库,有可能正是因为大量的访问进程才导致 “Resource temporarily unavailable”。
查看各服务进程:
root@host3:/var/log/heat# ps aux | grep heat-engine | wc -l
74
root@host3:/var/log/heat# ps aux | grep heat-api | wc -l
75
root@host3:/var/log/heat# ps aux | grep nova-api | wc -l
146
root@host3:/var/log/heat# ps aux | grep cinder-api | wc -l
74
Step4 查找进程数量参数
查看 heat-engine 进程,找到代码入口。
root@host3:/var/log/heat# ps aux | grep heat-engine
root 1584 0.0 0.0 12948 1088 pts/1 S+ 23:41 0:00 grep --color=auto heat-en
heat 141782 1.5 0.2 382712 139640 ? Ss 23:16 0:23 /usr/bin/python /usr/bin/heat-engine --config-file=/etc/heat/heat.conf --log-file=/var/log/heat/heat-engine.log
heat 141801 0.0 0.1 386808 129572 ? S 23:16 0:01 /usr/bin/python /usr/bin/heat-engine --config-file=/etc/heat/heat.conf --log-file=/var/log/heat/heat-engine.log
heat 141802 0.0 0.1 386808 129620 ? S 23:16 0:01 /usr/bin/python /usr/bin/heat-engine --config-file=/etc/heat/heat.conf --log-file=/var/log/heat/heat-engine.log
查看 /usr/bin/heat-engine 文件
#!/usr/bin/python
# PBR Generated from u'console_scripts'
import sys
from heat.cmd.engine import main
if __name__ == "__main__":
sys.exit(main())
进入 heat/cmd 目录
root@host3:/var/log/heat# cd /usr/lib/python2.7/dist-packages/heat/cmd/
root@host3:/usr/lib/python2.7/dist-packages/heat/cmd# ll
total 68
drwxr-xr-x 2 root root 4096 Jan 19 23:45 ./
drwxr-xr-x 16 root root 4096 Jan 19 03:07 ../
-rw-r--r-- 1 root root 2245 Oct 6 06:13 api_cfn.py
-rw-r--r-- 1 root root 2224 Jan 19 03:07 api_cfn.pyc
-rw-r--r-- 1 root root 2336 Oct 6 06:13 api_cloudwatch.py
-rw-r--r-- 1 root root 2267 Jan 19 03:07 api_cloudwatch.pyc
-rw-r--r-- 1 root root 2024 Oct 6 06:13 api.py
-rw-r--r-- 1 root root 2032 Jan 19 03:07 api.pyc
-rw-r--r-- 1 root root 2556 Oct 6 06:13 engine.py
-rw-r--r-- 1 root root 2505 Jan 19 03:07 engine.pyc
-rw-r--r-- 1 root root 0 Oct 6 06:13 __init__.py
-rw-r--r-- 1 root root 140 Jan 19 03:07 __init__.pyc
-rw-r--r-- 1 root root 8894 Oct 6 06:13 manage.py
-rw-r--r-- 1 root root 8407 Jan 19 03:07 manage.pyc
查看 engine.py 文件
def main():
logging.register_options(cfg.CONF)
cfg.CONF(project='heat', prog='heat-engine',
version=version.version_info.version_string())
logging.setup(cfg.CONF, 'heat-engine')
logging.set_defaults()
messaging.setup()
config.startup_sanity_check()
mgr = None
try:
mgr = template._get_template_extension_manager()
except template.TemplatePluginNotRegistered as ex:
LOG.critical(_LC("%s"), ex)
if not mgr or not mgr.names():
sys.exit("ERROR: No template format plugins registered")
from heat.engine import service as engine # noqa
profiler.setup('heat-engine', cfg.CONF.host)
gmr.TextGuruMeditation.setup_autorun(version)
srv = engine.EngineService(cfg.CONF.host, rpc_api.ENGINE_TOPIC)
workers = cfg.CONF.num_engine_workers
if not workers:
workers = max(4, processutils.get_worker_count())
launcher = service.launch(cfg.CONF, srv, workers=workers)
if cfg.CONF.enable_cloud_watch_lite:
# We create the periodic tasks here, which mean they are created
# only in the parent process when num_engine_workers>1 is specified
srv.create_periodic_tasks()
launcher.wait()
走读代码,发现 workers
这个变量。基本断定这个变量控制了 heat-engine 的进程数量,并且设置了最小值为 4,并且猜测 get_worker_count()
函数功能是服务器的 CPU 核数计算最佳性能的进程数量。
在 /etc/heat 目录下搜索 num_engine_workers:
root@host3:/etc/heat# grep -rn "num_engine_workers" -C 5
heat.conf.55490.2017-01-19@03:08:10~-204-# Maximum depth allowed when using nested stacks. (integer value)
heat.conf.55490.2017-01-19@03:08:10~-205-#max_nested_stack_depth = 5
heat.conf.55490.2017-01-19@03:08:10~-206-
heat.conf.55490.2017-01-19@03:08:10~-207-# Number of heat-engine processes to fork and run. Will default to either to 4
heat.conf.55490.2017-01-19@03:08:10~-208-# or number of CPUs on the host, whichever is greater. (integer value)
heat.conf.55490.2017-01-19@03:08:10~:209:#num_engine_workers = <None>
heat.conf.55490.2017-01-19@03:08:10~-210-
heat.conf.55490.2017-01-19@03:08:10~-211-#
heat.conf.55490.2017-01-19@03:08:10~-212-# From heat.common.crypt
heat.conf.55490.2017-01-19@03:08:10~-213-#
heat.conf.55490.2017-01-19@03:08:10~-214-
Number of heat-engine processes to fork and run. Will default to either to 4 or number of CPUs on the host, whichever is greater. (integer value)
Step5 本地测试
在 /etc/heat/heat.conf 文件中加入 num_engine_workers = 10
。
root@host3:/etc/heat# ps aux | grep heat-engine | wc -l
74
root@host3:/etc/heat# vim heat.conf
root@host3:/etc/heat# service heat-engine restart
root@host3:/etc/heat# ps aux | grep heat-engine | wc -l
12
可见加入这个配置项之后,heat-engine 的进程数量显著减少。
Step6 提交 Patch,修复 Bug
diff --git a/deploy/adapters/ansible/openstack/templates/nova.conf b/deploy/adapters/ansible/openstack
index 4a7bb0a..99071d7 100644
--- a/deploy/adapters/ansible/openstack/templates/nova.conf
+++ b/deploy/adapters/ansible/openstack/templates/nova.conf
@@ -39,6 +39,10 @@ notification_driver = nova.openstack.common.notifier.rpc_notifier
notification_driver = ceilometer.compute.nova_notifier
memcached_servers = {{ memcached_servers }}
+{% if ansible_processor_vcpus > 30 %}
+osapi_compute_workers = 30
+{% endif %}
+
[database]
# The SQLAlchemy connection string used to connect to the database
connection = mysql://nova:{{ NOVA_DBPASS }}@{{ db_host }}/nova
diff --git a/deploy/adapters/ansible/roles/cinder-controller/templates/cinder.conf b/deploy/adapters/a
index d428a07..05a1c8f 100644
--- a/deploy/adapters/ansible/roles/cinder-controller/templates/cinder.conf
+++ b/deploy/adapters/ansible/roles/cinder-controller/templates/cinder.conf
@@ -35,6 +35,9 @@ quota_driver = cinder.quota.DbQuotaDriver
osapi_volume_listen = {{ storage_controller_host }}
osapi_volume_listen_port = 8776
+{% if ansible_processor_vcpus > 30 %}
+osapi_volume_workers = 30
+{% endif %}
db_backend = sqlalchemy
volume_name_template = volume-%s
由于 nova-api,cinder-api,heat-api,heat-engine 的进程数量较大,对它们加入了最大值限制。
Patch 地址:FIX access database failed
Bug 引入原因
由于数据库的性能瓶颈,导致各个服务利用 CPU 核心数计算出来的最佳性能进程数量压垮了 MySQL。这也从侧面反映了 Compass4NFV 在本地进行虚拟部署时非常稳定,但是一旦进行物理部署,就会出现各种问题。服务器性能越好,CPU 核心数越多,部署成功的概率反而越低。
问题反思
有些偶然性问题导致部署失败,虽然重新构建一把说不定就能够过去,但是这颗炸弹会一直隐藏在代码中。所以面对 CI 上那些低概率的问题,应该更加重视,及时优化代码。