[Case Study] Ceph Fault Handling



1. Fault Description


Antute received a customer service request. On arriving on site, we found the Ceph cluster in a warning state: one PG was reporting an error and causing slow service response. The issue involved osd.1:

[root@dfs01 ceph]# ceph -s
  cluster:
    id:     798fb87a-0d6c-4c20-8298-95074eb642fe
    health: HEALTH_WARN
            Reduced data availability: 1 pg inactive, 1 pg stale
            Degraded data redundancy: 1 pg undersized

  services:
    mon: 5 daemons, quorum dfs01,dfs02,dfs03,dfs04,dfs05 (age 17h)
    mgr: mgr1(active, since 17h), standbys: mgr2, mgr3
    mds: ora_arch:1 {0=dfs02=up:active} 2 up:standby
    osd: 6 osds: 6 up (since 14h), 6 in (since 14h)

  data:
    pools:   5 pools, 417 pgs
    objects: 38 objects, 85 KiB
    usage:   6.1 GiB used, 24 GiB / 30 GiB avail
    pgs:     0.240% pgs not active
             416 active+clean
             1   stale+undersized+peered

  progress:
    Rebalancing after osd.1 marked in (14h)
      [............................]
    PG autoscaler decreasing pool 4 PGs from 128 to 32 (13h)
      [............................]
    PG autoscaler decreasing pool 5 PGs from 128 to 16 (13h)
      [............................]
    PG autoscaler decreasing pool 2 PGs from 128 to 32 (14h)
      [............................]
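The progress section shows that a rebalance and several PG autoscaler operations are still running. As a hedged supplement that was not captured in the original session, ongoing cluster events can also be followed live while troubleshooting:

ceph -w    # stream cluster log and health events as they happen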



2. Troubleshooting


2.1 Check OSD status

All OSDs are in a normal state:

[root@dfs01 ceph]# ceph osd status
ID  HOST   USED   AVAIL  WR OPS  WR DATA  RD OPS  RD DATA  STATE
 1  dfs01  1034M  4081M     0        0       0        0    exists,up
 2  dfs02  1034M  4081M     0        0       0        0    exists,up
 3  dfs03  1034M  4081M     0        0       0        0    exists,up
 4  dfs04  1034M  4081M     0        0       0        0    exists,up
 5  dfs05  1034M  4081M     0        0       0        0    exists,up
 6  dfs06  1034M  4081M     0        0       0        0    exists,up

[root@dfs01 ceph]# ceph osd stat
6 osds: 6 up (since 14h), 6 in (since 14h); epoch: e76


[root@dfs01 ceph]# ceph osd tree
ID   CLASS  WEIGHT   TYPE NAME       STATUS  REWEIGHT  PRI-AFF
 -1         0.02939  root default
 -3         0.00490      host dfs01
  1    hdd  0.00490          osd.1       up   1.00000  1.00000
 -5         0.00490      host dfs02
  2    hdd  0.00490          osd.2       up   1.00000  1.00000
 -7         0.00490      host dfs03
  3    hdd  0.00490          osd.3       up   1.00000  1.00000
 -9         0.00490      host dfs04
  4    hdd  0.00490          osd.4       up   1.00000  1.00000
-11         0.00490      host dfs05
  5    hdd  0.00490          osd.5       up   1.00000  1.00000
-13         0.00490      host dfs06
  6    hdd  0.00490          osd.6       up   1.00000  1.00000
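As a supplementary check that is not part of the original session, per-OSD utilization and PG counts can also be compared; an OSD that is abnormally full or holds far fewer PGs than its peers would point at a weighting or CRUSH problem:

ceph osd df tree    # per-OSD size, usage and PG count, grouped by the CRUSH tree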



2.2 Locate the faulty PG


[root@dfs01 ceph]# ceph health detail
HEALTH_WARN Reduced data availability: 1 pg inactive, 1 pg stale; Degraded data redundancy: 1 pg undersized
[WRN] PG_AVAILABILITY: Reduced data availability: 1 pg inactive, 1 pg stale
    pg 1.0 is stuck stale for 14h, current state stale+undersized+peered, last acting [0]
[WRN] PG_DEGRADED: Degraded data redundancy: 1 pg undersized
    pg 1.0 is stuck undersized for 14h, current state stale+undersized+peered, last acting [0]


The commands above show that pg 1.0 is the problem PG. Query its details:

[root@dfs01 ceph]# ceph pg 1.0 query
Error ENOENT: i don't have pgid 1.0

[root@dfs01 ceph]# ceph pg dump_stuck inactive
ok

[root@dfs01 ceph]# ceph pg dump_stuck unclean
ok
PG_STAT  STATE                    UP   UP_PRIMARY  ACTING  ACTING_PRIMARY
1.0      stale+undersized+peered  [0]
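When ceph pg query fails with ENOENT, the monitors can still report where the PG is supposed to live. As an additional hedged check that was not recorded in the original session, the up and acting sets for the PG can be read from the current OSDMap:

ceph pg map 1.0    # prints the OSDMap epoch plus the up and acting OSD sets for pg 1.0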


2.3 Check pool information

[root@dfs01 ceph]# ceph osd lspools
1 device_health_metrics
2 database_pool
4 fs_data
5 fs_metadata

[root@dfs01 ceph]# ceph osd pool ls detail
pool 1 'device_health_metrics' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 12 flags hashpspool stripe_width 0 pg_num_min 1 application mgr_devicehealth
pool 2 'database_pool' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 128 pgp_num 128 pg_num_target 32 pgp_num_target 32 autoscale_mode on last_change 58 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd
pool 4 'fs_data' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 128 pgp_num 128 pg_num_target 32 pgp_num_target 32 autoscale_mode on last_change 75 flags hashpspool stripe_width 0 application cephfs
pool 5 'fs_metadata' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 128 pgp_num 128 pg_num_target 16 pgp_num_target 16 autoscale_mode on last_change 76 flags hashpspool stripe_width 0 pg_autoscale_bias 4 pg_num_min 16 recovery_priority 5 application cephfs


3. Fault Resolution

3.1 Identify the affected pool

ceph pg ls-by-pool device_health_metrics | grep ^1.0

The command above shows that this PG belongs to the device_health_metrics pool. Since device_health_metrics is not a critical pool, we decided to rebuild the PG.
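The pool can also be read directly off the PG ID, since the number before the dot is the pool ID. A minimal hedged cross-check against the lspools output above:

ceph osd lspools | grep '^1 '    # pool ID 1 corresponds to device_health_metrics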

3.2 Attempt a repair

ceph pg repair 1.0
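A repair can only help if at least one OSD still holds a copy of the PG; since ceph pg 1.0 query had already returned ENOENT, the repair did not clear the warning here, which is why the PG is rebuilt in 3.3. A hedged way to re-check after issuing the repair (not captured in the original session):

ceph health detail           # the stale/undersized warning for pg 1.0 is still reported
ceph pg dump_stuck unclean   # pg 1.0 still shows stale+undersized+peered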


3.3 Rebuild the PG

ceph osd force-create-pg 1.0 --yes-i-really-mean-it

If objects are still missing after every possible location has been queried, give up on them and mark the "unfound" objects as "lost".
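A hedged sketch of how that would be done (not needed in this case, because the pool held no critical data): list the unfound objects of the PG, then revert or delete them.

ceph pg 1.0 list_unfound                    # list objects the cluster cannot locate
ceph pg 1.0 mark_unfound_lost revert        # roll back to an earlier version where one exists
# or: ceph pg 1.0 mark_unfound_lost delete  # forget the objects entirely

Here the force-create-pg simply recreated pg 1.0 as an empty PG; querying it again shows it is back to active+clean: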

[root@dfs01 ceph]# ceph pg 1.0 query
{
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "state": "active+clean",
    "epoch": 106,
    "up": [
        4,
        3,
        2
    ],
    "acting": [
        4,
        3,
        2
    ],
    "acting_recovery_backfill": [
        "2",
        "3",
        "4"
    ],
    "info": {
        "pgid": "1.0",
        "last_update": "0'0",
        "last_complete": "0'0",
        "log_tail": "0'0",
        "last_user_version": 0,
        "last_backfill": "MAX",
        "purged_snaps": [],
        "history": {
            "epoch_created": 77,
            "epoch_pool_created": 77,
            "last_epoch_started": 79,
            "last_interval_started": 77,
            "last_epoch_clean": 79,
            "last_interval_clean": 77,
            "last_epoch_split": 0,
            "last_epoch_marked_full": 0,
            "same_up_since": 77,
            "same_interval_since": 77,
            "same_primary_since": 77,
            "last_scrub": "0'0",
            "last_scrub_stamp": "2021-10-09T10:16:06.538634+0800",
            "last_deep_scrub": "0'0",
            "last_deep_scrub_stamp": "2021-10-09T10:16:06.538634+0800",
            "last_clean_scrub_stamp": "2021-10-09T10:16:06.538634+0800",
            "prior_readable_until_ub": 0
        },
        "stats": {
            "version": "0'0",
            "reported_seq": 37,
            "reported_epoch": 106,
            "state": "active+clean",
            "last_fresh": "2021-10-09T10:16:48.108134+0800",
            "last_change": "2021-10-09T10:16:08.944500+0800",
            "last_active": "2021-10-09T10:16:48.108134+0800",
            "last_peered": "2021-10-09T10:16:48.108134+0800",
            "last_clean": "2021-10-09T10:16:48.108134+0800",
            "last_became_active": "2021-10-09T10:16:08.943940+0800",
            "last_became_peered": "2021-10-09T10:16:08.943940+0800",
            "last_unstale": "2021-10-09T10:16:48.108134+0800",
            "last_undegraded": "2021-10-09T10:16:48.108134+0800",
            "last_fullsized": "2021-10-09T10:16:48.108134+0800",
            "mapping_epoch": 77,
            "log_start": "0'0",
            "ondisk_log_start": "0'0",
            "created": 77,
            "last_epoch_clean": 79,
            "parent": "0.0",
            "parent_split_bits": 0,
            "last_scrub": "0'0",
            "last_scrub_stamp": "2021-10-09T10:16:06.538634+0800",
            "last_deep_scrub": "0'0",
            "last_deep_scrub_stamp": "2021-10-09T10:16:06.538634+0800",
            "last_clean_scrub_stamp": "2021-10-09T10:16:06.538634+0800",
            "log_size": 0,
            "ondisk_log_size": 0,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "manifest_stats_invalid": false,
            "snaptrimq_len": 0,
}


4. Lessons Learned


4.1 Multi-replica pools

For important data, configure pools with 2 or 3 replicas.
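A hedged sketch of how replica counts are set per pool (database_pool is simply the example pool from this case):

ceph osd pool set database_pool size 3       # keep three copies of every object
ceph osd pool set database_pool min_size 2   # keep serving I/O while at least two copies are available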

4.2 Operate with care

Disk replacement must strictly follow the procedure: first mark the OSD out and stop the OSD service, then remove the OSD, and only then replace the disk, as sketched below.
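A minimal sketch of that flow, assuming the failed disk backs osd.1 and the OSDs are managed by systemd (IDs and unit names must be adapted to the environment):

ceph osd out osd.1                        # mark the OSD out so data rebalances off it
# wait for ceph -s to show recovery finished before proceeding
systemctl stop ceph-osd@1                 # stop the OSD daemon on its host
ceph osd purge 1 --yes-i-really-mean-it   # remove it from the CRUSH map, auth keys and OSD map
# only now replace the physical disk and recreate the OSD (for example with ceph-volume)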

4.3 State definitions

Undersized: the PG's current acting set contains fewer OSDs than the pool's replica count (size).

Peered: peering has completed, but the PG's current acting set is smaller than the pool's minimum replica count (min_size), so client I/O cannot be served.
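To see which PGs are currently in these states, the state names can be passed to ceph pg ls; a hedged supplement to the commands used in 2.2:

ceph pg ls undersized     # PGs whose acting set is smaller than the pool's size
ceph pg dump_stuck stale  # PGs stuck in the stale state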

For more information, please visit the official Antute website: www.antute.com.cn
