Galera Cluster for MySQL of step by step install manual

1. 简述

  1. 基础系统为CentOS 9-stream(已支持CentOS 7.9 2009)。
  2. 安装一键安装包,安装时设置不安装mysql:./plasma_install_{version}.sh -off-my
  3. 将依赖包mysql-wsrep-rpms.tgz上传至服务器。

2. 安装依赖

# 卸载默认数据库
rpm -qa | grep mysql | xargs rpm -e

# 停止SELINUX
setenforce 0 && sed '/^SELINUX=/s/=.*/=disabled/p' -i /etc/selinux/config

# 开放防火墙端口
firewall-cmd --add-port=3306/tcp --add-port=4567/tcp --add-port=4567/udp --add-port=4568/tcp --add-port=4444/tcp --permanent

firewall-cmd --reload

# 安装依赖
tar zxvf ./mysql-wsrep-rpms.tgz -C /plasma

if [ `uname -r | awk -F'.' '{NF=NF-1;print $NF}'` == 'el7' ];then
    find /plasma/.mysql-wsrep-rpms/el7/*.rpm -type f -exec rpm -iUvh --nodeps {} \;
    cp /plasma/.mysql-wsrep-rpms/el7/libstdc++.so.6.0.26 /usr/lib64/
    cd /usr/lib64/
    unlink libstdc++.so.6
    ln libstdc++.so.6.0.26 libstdc++.so.6
    cd $OLDPWD
else
    find /plasma/.mysql-wsrep-rpms/el9/*.rpm -type f -exec rpm -iUvh --nodeps {} \;
fi;

find /plasma/.mysql-wsrep-rpms/*.rpm -maxdepth 1 -type f -exec rpm -iUvh --nodeps {} \;

# 编译安装openssl 1.1.1u依赖
tar zxvf /plasma/.mysql-wsrep-rpms/openssl-1.1.1u.tar.gz -C /usr/local/src

cd /usr/local/src/openssl-1.1.1u

./config shared --openssldir=/usr/local/lib64/openssl-1.1.1u --prefix=/usr/local/lib64/openssl-1.1.1u

make && make install

if [ $? == 0 ]; then
    ln -s /usr/local/lib64/openssl-1.1.1u/lib/libssl.so.1.1 /usr/lib64/libssl.so.1.1
    ln -s /usr/local/lib64/openssl-1.1.1u/lib/libcrypto.so.1.1 /usr/lib64/libcrypto.so.1.1
else
    echo "安装失败"
fi;

# 返回原工作目录
cd $OLDPWD

3. 初始化配置文件、数据库

# 配置MYSQL节点信息
# 当前节点IP地址
NODE_HOST="192.168.0.132"
# 节点名,各节点名称应有区别
NODE_NAME="node3"
# 节点id,各节点id不允许重复
NODE_ID=3
# 节点池,将各节点IP地址填入,逗号分隔,第一个节点配置时暂时不加入ip,应置空填"",表示新创建集群的意思,一但加ip会报配置文件错误。
# 需要在后续集群创建完成加入各个节点后在回来填写ip。
# 启动集群后修改,再重启主节点
NODE_COMM="192.168.0.130,192.168.0.131,192.168.0.132"

# 导入配置文件
cat > /etc/my.cnf <<EOF
[mysql]
socket    = /plasma/mysql/mysql.sock
port	  = 3306

[mysqladmin]
socket    = /plasma/mysql/mysql.sock
port	  = 3306

[mysqldump]
socket    = /plasma/mysql/mysql.sock
port	  = 3306
max_allowed_packet = 1G

[mysqld]
# 集群配置
wsrep_provider = /usr/lib64/galera-4/libgalera_smm.so
wsrep_cluster_name='plasma' 
wsrep_cluster_address = "gcomm://$NODE_COMM" 
wsrep_node_name = "$NODE_NAME" 
wsrep_node_address="$NODE_HOST"
wsrep_on=ON 
binlog_format=ROW 
binlog_row_image=minimal
default_storage_engine=InnoDB 
innodb_autoinc_lock_mode=2 
wsrep_applier_threads=1 
innodb_flush_log_at_trx_commit=2 
innodb_buffer_pool_size=1024M 
wsrep_sst_method=rsync
wsrep_sst_auth=USERNAME:PASSWORD
bind-address=0.0.0.0
wsrep_provider_options="gcache.size=256M; gcache.page_size=128M"
server_id = $NODE_ID

# 数据存储路径
datadir   = /plasma/mysql
socket    = /plasma/mysql/mysql.sock
pid-file  = /plasma/mysql/mysqld.pid

# 日志存储路径
user      = mysql
log-error = /plasma/mysql/mysqld.err
log-bin = mysql-bin
slow_query_log

# mysqld运行参数
character-set-server = utf8mb4
collation-server = utf8mb4_unicode_ci
init_connect='SET collation_connection = utf8mb4_unicode_ci'
init_connect='SET NAMES utf8mb4'

lower_case_table_names=1
skip-name-resolve
log_bin_trust_function_creators
event_scheduler = on
back_log = 512
max_connections = 3000
max_connect_errors = 50
max_allowed_packet = 1G
max_heap_table_size = 512M
table_open_cache = 5120
read_buffer_size = 64M
read_rnd_buffer_size = 64M
sort_buffer_size = 128M
join_buffer_size = 128M
tmp_table_size = 512M
bulk_insert_buffer_size = 256M
innodb_log_file_size = 512M
innodb_buffer_pool_size = 8G
innodb_data_file_path = ibdata1:512M:autoextend
innodb_lock_wait_timeout = 420
sql_mode=STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION
EOF

# 临时注释wsrp配置
sed "/^wsrep/s/^/#/" -i /etc/my.cnf

# 启动数据库,并初始化
systemctl enable --now mysqld

# MySQL账户初始化
GENERATE_PWD="$(sed -ne '/MY-010454/p' /plasma/mysql/mysqld.err | awk '{print $NF}')"

mysqladmin -p"$GENERATE_PWD" password "PASSWORD"

mysql -p"PASSWORD" <<EOF
create user USERNAME@"localhost" identified with mysql_native_password by "PASSWORD";
create user USERNAME@"%" identified with mysql_native_password by "PASSWORD";
grant all privileges on *.* to USERNAME@"%";
grant all privileges on *.* to USERNAME@"localhost";
flush privileges
EOF

# 停止数据库
systemctl stop mysqld

4. 启动节点

# -- 启动节点 --
# 取消集群配置注释
sed -i '/^#wsrep/s/^#//' /etc/my.cnf

# 创建集群
# 注意!启动主节点时执行该命令,非主节点不需要
mysqld_bootstrap

# 非主节点使用systemd来管理启动
# 启用mysqld
systemctl start mysqld

5. 集群启动检查

show status like 'wsrep_cluster_size';  # 显示集群里节点数
show status like 'wsrep_connected';  # 应该显示ON
show status like 'wsrep_incoming_addresses';  # 显示节点ip
show status like 'wsrep_local_state_comment';  # 显示节点的同步状态

6. 主节点配置完善

# 登录主节点mysql命令行
SQL> show status like 'wsrep_cluster_size';  # 显示集群里节点数
# 检查其他节点是否有加入群集
# 修改主节点/etc/my.cnf,添加群集节点信息
NODE_COMM="192.168.0.130,192.168.0.131,192.168.0.132"

sed '/^wsrep_cluster_address/s/\"gcomm:\/\/.*\"/\"gcomm:\/\/'$NODE_COMM'\"/' -i /etc/my.cnf

7. 常见集群故障排除

集群启动错误时处理方案

# 启动主集群时
mysqld_bootstrap

# 提示报错信息时:
# Job for mysqld.service failed because the control process exited with error code.
# See "systemctl status mysqld.service" and "journalctl -xeu mysqld.service" for details.

# 检查最后报错信息
tail /plasma/mysql/mysqld.err | grep ERROR

# 如出现如下报错信息:
# It may not be safe to bootstrap the cluster from this node. It was not the last one to leave the cluster and may not contain all the updates. To force cluster bootstrap with this node, edit the grastate.dat file manually and set safe_to_bootstrap to 1 
# 中文意思为: 从该节点引导群集可能不安全。它不是最后一个离开集群的,可能不包含所有更新。要使用此节点强制集群引导,请手动编辑grastate.dat文件,并将safe_To_bootstrap设置为1

# 数据库集群宕机,在运行/bin/galera_new_cluster启动第一个节点时报错,意思是该节点不是最后一个停掉的,不能安全启动;
# 然后可以尝试在其他节点运行该命令;
# 如果报错都相同,则需要从3个节点中选取一个主节点,修改/plasma/mysql/grastate.dat,把其中safe_to_bootstrap的值改为1即可。然后运行mysqld_bootstrap。其他节点依次启动

sed '/^safe_to_bootstrap/s/0/1/' -i /plasma/mysql/grastate.dat

# 然后启动集群
mysqld_bootstrap

mysqld.err日志报MySQL Unknown/unsupported storage engine: INNODB错误时处理方案

# 
rm -f /plasma/mysql/{id_logfile0,id_logfile1,ibdata1}
# 重新启动服务

集群脑裂时处理方案

# mysqld.err日志报[ERROR] WSREP: failed to open gcomm backend connection: 110错误时。
# 1、数据库如果正常关闭的话,gvwstate.dat文件是不存在的,如果三个节点同时关闭(无数据变化)seqno值应该是相等的,且不为-1。
# 2、如果这个时候无论先后顺利启动节点,都会导致所有节点都在不断重启中,并且seqno值由原来的值变为-1。
# 3、此时无法选举出来哪个作为主启动节点,需要再第一个节点启动时手动指定该节点作为集群主节点启动。
# 解决办法:
# 主节点使用mysqld_bootstrap启动
mysqld_bootstrap

# 正常启动后的节点将重新生成gvwstate.dat文件

Author: Andy ho.