ceph active+undersized 警告

ceph active+undersized warning

设置:


    # HDD erasure-code profile: k=22 data chunks + m=14 coding chunks
    # (36 chunks per object, ~64% raw-space overhead).
    # Data stays recoverable with up to m=14 HDD OSDs lost; the intent is to
    # survive one host failure plus 2 further OSD failures.
    # Failure domain is the OSD, so several chunks of a PG may share a host.
    ceph osd erasure-code-profile set hdd_k22_m14_osd \
    k=22 \
    m=14 \
    crush-device-class=hdd \
    crush-failure-domain=osd
    
    # SSD erasure-code profile: k=44 data chunks + m=28 coding chunks
    # (72 chunks per object, ~64% raw-space overhead).
    # Data stays recoverable with up to m=28 SSD OSDs lost; the intent is to
    # survive one host failure plus 4 further OSD failures.
    ceph osd erasure-code-profile set ssd_k44_m28_osd \
    k=44 \
    m=28 \
    crush-device-class=ssd \
    crush-failure-domain=osd
    
    # Create the HDD erasure-coded pool with min_size = k+2 = 24.
    # NOTE: a PG only serves I/O while at least min_size chunks are available,
    # i.e. with at most 36 - 24 = 12 chunks missing.
    # The pool name must be identical on every line (hdd, not hhd); otherwise
    # the pg_num / pgp_num settings are rejected for a non-existent pool.
    ceph osd pool create cephfs.vol1.test.hdd.ec erasure hdd_k22_m14_osd
    ceph osd pool set cephfs.vol1.test.hdd.ec allow_ec_overwrites true
    ceph osd pool set cephfs.vol1.test.hdd.ec pg_num 128
    ceph osd pool set cephfs.vol1.test.hdd.ec pgp_num 128
    ceph osd pool set cephfs.vol1.test.hdd.ec min_size 24
    
    # Create the SSD erasure-coded pool with min_size = k+2 = 46.
    # NOTE: a PG only serves I/O with at most 72 - 46 = 26 chunks missing.
    ceph osd pool create cephfs.vol1.test.ssd.ec erasure ssd_k44_m28_osd
    ceph osd pool set cephfs.vol1.test.ssd.ec allow_ec_overwrites true
    ceph osd pool set cephfs.vol1.test.ssd.ec pg_num 128
    ceph osd pool set cephfs.vol1.test.ssd.ec pgp_num 128
    ceph osd pool set cephfs.vol1.test.ssd.ec min_size 46


    # ceph osd erasure-code-profile get hdd_k22_m14_osd
    crush-device-class=hdd
    crush-failure-domain=osd
    crush-root=default
    jerasure-per-chunk-alignment=false
    k=22
    m=14
    plugin=jerasure
    technique=reed_sol_van
    w=8


    # ceph osd pool ls detail | grep hdd
    pool 16 'cephfs.vol1.test.hdd.ec' erasure profile hdd_k22_m14_osd size 36 min_size 24 crush_rule 7 object_hash rjenkins pg_num 253 pgp_num 241 pg_num_target 128 pgp_num_target 128 autoscale_mode on last_change 17748 lfor 0/7144/7142 flags hashpspool,ec_overwrites stripe_width 90112 target_size_bytes 344147139493888 application cephfs
    # ceph osd pool ls detail | grep ssd
    pool 17 'cephfs.vol1.test.ssd.ec' erasure profile ssd_k44_m28_osd size 72 min_size 46 crush_rule 8 object_hash rjenkins pg_num 128 pgp_num 128 autoscale_mode on last_change 13591 lfor 0/0/7109 flags hashpspool,ec_overwrites stripe_width 180224 target_size_bytes 113249697660928 application cephfs


    {
        "rule_id": 7,
        "rule_name": "cephfs.vol1.test.hdd.ec",
        "ruleset": 7,
        "type": 3,
        "min_size": 3,
        "max_size": 36,
        "steps": [
            {
                "op": "set_chooseleaf_tries",
                "num": 5
            },
            {
                "op": "set_choose_tries",
                "num": 100
            },
            {
                "op": "take",
                "item": -2,
                "item_name": "default~hdd"
            },
            {
                "op": "choose_indep",
                "num": 0,
                "type": "osd"
            },
            {
                "op": "emit"
            }
        ]
    }


    {
        "rule_id": 8,
        "rule_name": "cephfs.vol1.test.ssd.ec",
        "ruleset": 8,
        "type": 3,
        "min_size": 3,
        "max_size": 72,
        "steps": [
            {
                "op": "set_chooseleaf_tries",
                "num": 5
            },
            {
                "op": "set_choose_tries",
                "num": 100
            },
            {
                "op": "take",
                "item": -12,
                "item_name": "default~ssd"
            },
            {
                "op": "choose_indep",
                "num": 0,
                "type": "osd"
            },
            {
                "op": "emit"
            }
        ]
    } 

问题:

但是,此设置似乎不起作用并给出:


    # ceph -s 
      cluster:
        id:     <id>
        health: HEALTH_WARN
                Degraded data redundancy: 19 pgs undersized
                20 pgs not deep-scrubbed in time


    # ceph health detail
        pg 17.0 is stuck undersized for 7h, current state active+undersized, last acting [92,76,44,84,46,72,102,104,59,62,60,89,40,47,65,38,95,79,43,67,91,69,80,83,94,48,42,90,88,37,49,75,53,58,93,45,96,61,106,64,52,70,77,99,107,63,97,100,56,98,87,105,36,68,103,55,85,2147483647,82,66,51,101,81,54,78,74,39,50,73,71,57,41]
        pg 17.1 is stuck undersized for 7h, current state active+undersized, last acting [69,59,75,104,79,83,89,51,76,102,37,54,95,60,105,87,43,91,70,101,45,94,68,57,72,107,53,49,40,50,65,61,88,84,73,58,47,96,48,100,103,42,52,71,63,86,39,64,97,41,46,81,67,36,93,82,62,38,98,90,85,2147483647,44,99,55,80,78,56,92,66,106,77]
        pg 17.4 is stuck undersized for 7h, current state active+undersized, last acting [46,84,96,39,38,94,82,67,103,63,50,52,106,42,61,64,45,62,74,79,101,48,2147483647,85,105,59,72,81,91,60,56,71,102,77,70,57,54,100,49,75,36,53,92,98,58,83,51,69,44,89,65,47,43,41,99,107,90,76,37,68,80,40,55,93,104,66,95,78,86,97,73,88]
        pg 17.5 is stuck undersized for 7h, current state active+undersized, last acting [63,64,93,82,69,90,60,102,89,104,50,103,55,52,66,98,99,65,100,48,53,76,68,62,84,87,57,42,75,46,83,71,43,92,51,44,80,56,61,88,77,37,38,39,81,74,105,49,85,41,91,36,79,54,45,94,67,101,72,96,47,73,86,2147483647,106,97,70,107,59,78,40,95]
        pg 17.6 is stuck undersized for 7h, current state active+undersized, last acting [48,67,88,105,97,78,92,79,58,59,46,98,91,45,96,52,38,57,41,81,73,49,89,55,86,68,37,39,77,47,83,76,54,94,44,70,43,62,42,60,104,64,84,85,63,102,87,90,71,80,103,100,101,40,50,72,75,95,51,82,53,36,65,61,106,93,2147483647,99,56,74,107,66]
        pg 17.7 is stuck undersized for 7h, current state active+undersized, last acting [69,79,84,103,37,60,75,42,67,40,65,90,99,85,63,91,83,58,104,56,43,62,55,86,82,72,73,106,87,68,57,50,64,96,41,39,61,71,93,97,59,92,102,81,38,98,48,51,95,101,52,74,77,53,44,49,45,107,78,88,70,105,46,54,80,36,47,89,76,66,100,2147483647]
        pg 17.8 is stuck undersized for 7h, current state active+undersized, last acting [71,78,99,81,43,58,54,86,95,82,52,46,73,69,97,39,93,88,59,105,103,91,50,101,102,49,51,64,98,90,84,75,42,107,56,83,60,67,70,55,104,61,66,79,96,74,63,72,92,53,2147483647,100,62,77,45,87,85,89,76,80,37,44,68,57,41,94,40,48,38,47,65,36]
        pg 17.a is stuck undersized for 7h, current state active+undersized, last acting [65,42,58,61,52,57,60,85,100,75,98,40,74,79,38,72,91,48,93,80,54,41,83,95,76,49,46,71,55,88,63,94,73,44,45,102,89,107,92,86,53,103,47,43,56,82,104,106,51,37,36,39,99,97,59,81,64,66,84,96,90,77,87,78,50,105,62,67,69,70,101,2147483647]
        pg 17.b is stuck undersized for 7h, current state active+undersized, last acting [47,54,59,93,91,36,58,98,39,60,46,49,78,64,88,100,66,107,92,83,99,56,63,87,41,96,89,45,51,76,69,71,103,94,90,50,85,68,81,73,75,105,40,79,84,44,80,37,42,52,95,70,62,55,82,53,38,72,65,2147483647,48,106,43,101,104,86,61,57,102,77,74,67]
        pg 17.d is stuck undersized for 7h, current state active+undersized, last acting [92,83,39,44,75,98,96,61,41,64,38,97,63,37,70,68,87,90,36,77,73,60,69,55,93,47,2147483647,56,102,50,54,91,82,58,43,67,53,86,81,95,105,52,85,51,79,46,62,49,80,40,57,42,104,107,78,84,94,103,48,72,88,74,71,45,101,99,65,59,106,66,100,76]
        pg 17.10 is stuck undersized for 7h, current state active+undersized, last acting [96,94,52,46,43,50,82,97,75,84,53,106,91,78,64,65,42,95,98,87,69,99,77,59,76,2147483647,49,70,79,90,105,81,107,86,45,39,55,93,92,56,72,37,101,36,85,100,67,47,104,74,63,38,48,68,44,60,57,61,40,88,51,62,71,83,58,89,103,80,102,41,54,73]
        pg 17.13 is stuck undersized for 7h, current state active+undersized, last acting [46,55,50,77,73,97,45,57,67,95,103,38,90,106,66,87,36,44,82,49,100,107,84,88,102,40,65,60,43,70,42,86,48,39,71,74,99,56,59,96,72,92,101,62,93,51,47,52,85,53,104,76,37,79,58,94,81,64,83,68,69,63,54,80,98,61,78,105,2147483647,91,75,41]
        pg 17.14 is stuck undersized for 7h, current state active+undersized, last acting [105,62,66,55,53,51,97,50,65,90,104,56,74,52,70,100,42,107,101,40,58,63,44,49,59,69,38,80,73,102,36,76,106,75,39,99,92,60,94,91,89,41,46,72,88,2147483647,87,98,71,78,54,68,84,95,57,103,81,82,96,61,67,79,37,83,86,47,93,77,64,48,85,45]
        pg 17.19 is stuck undersized for 7h, current state active+undersized, last acting [50,90,73,99,45,101,72,93,85,47,59,78,95,83,96,58,76,39,43,49,44,92,91,102,81,74,62,86,54,56,103,87,70,105,75,48,88,97,67,38,57,46,36,84,107,66,65,69,106,41,80,42,52,63,64,61,98,100,79,60,51,94,53,89,37,68,40,55,77,71,2147483647,104]
        pg 17.1a is stuck undersized for 7h, current state active+undersized, last acting [70,95,59,78,87,85,66,68,40,63,90,73,89,101,86,80,82,50,107,74,55,49,72,48,43,104,62,97,81,94,103,58,77,52,2147483647,102,53,75,106,91,88,57,42,61,99,79,39,54,38,96,37,45,76,105,51,84,60,47,93,98,83,100,64,65,44,36,56,71,67,46,41,69]
        pg 17.1b is stuck undersized for 7h, current state active+undersized, last acting [84,37,62,58,87,36,94,77,53,55,45,93,43,82,75,78,101,104,95,106,98,107,61,99,38,46,52,76,56,51,66,83,42,80,63,81,79,86,100,90,88,65,47,60,44,103,2147483647,73,59,69,102,67,57,70,72,41,105,54,64,91,97,48,74,89,92,96,40,71,50,39,49,68]
        pg 17.1e is stuck undersized for 7h, current state active+undersized, last acting [103,48,71,70,104,47,77,56,55,89,68,97,72,82,36,69,40,83,107,38,80,76,39,100,92,79,57,37,42,66,98,53,62,43,84,95,75,105,59,94,106,45,88,54,96,67,91,46,44,58,2147483647,93,73,64,85,78,101,65,50,99,74,102,49,51,41,61,87,90,52,63,60,81]

并且外部集群 rook pvc 挂载无法写入。

这里做错了什么?为什么这些 PG 处于 undersized 状态?

这个设计真的很糟糕,你应该从头开始。首先,您创建的块数太多,没有必要这样做。让所有主机都参与进来也是一个糟糕的选择,因为如果主机甚至 OSD 发生故障,就没有恢复空间,因此您的集群将处于降级状态,直到发生故障的主机或 OSD 重新上线。其次,OSD 作为故障域也不是一个好的选择,通常你会选择 host 作为故障域。对于相对较小的设置,我宁愿选择大小为 6 的复制池(每个节点 2 个副本,您可以丢失 2 个主机而不会丢失数据)。如果您真的需要使用 EC,请注意您将无法承受主机丢失,因为没有足够的空间来恢复。您可以选择像 k=2、m=4 这样的配置文件——或者如果您想要更多的块,则将其设为 k=3、m=6——并将 OSD 保留为故障域,但正如我所说,它不是很有弹性。你最好使用复制池。

您的 PG 降级的原因取决于几件事。如果您想保留当前设置(我不推荐),您可以先贴出 `ceph osd tree` 和 `ceph osd df` 的输出。