Ceph SSD Tiering

Published by Andrea

I've tried to add an SSD tier to a 3-node Ceph block storage cluster to check the performance boost.
The environment is composed of 6 CentOS VMs: 1 admin, 1 monitor+dashboard, 3 OSDs, and 1 RBD client used as an NFS gateway with the kernel module, all running on a 6-node vSphere 6 cluster.



The write performance of an ESX file copy on our Ceph cluster with spinning disks is about 70-80 MB/s.

I've added a 64 GB SSD virtual disk to all 3 OSD nodes, and then listed all the SCSI buses with these commands

sudo -i

ls /sys/class/scsi_host/

Executing the following commands, I've rescanned all the buses found before

echo "- - -" > /sys/class/scsi_host/host0/scan
echo "- - -" > /sys/class/scsi_host/host1/scan
...
echo "- - -" > /sys/class/scsi_host/host31/scan
echo "- - -" > /sys/class/scsi_host/host32/scan
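
The same rescan can also be expressed as a small loop over all the host entries instead of repeating the echo by hand; a minimal sketch, assuming a root shell:

for h in /sys/class/scsi_host/host*; do
  echo "- - -" > "$h/scan"
done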

Now, with a new drive in each OSD node, I can find the associated device with

fdisk -l

The output is this

Disk /dev/sdd: 68.7 GB, 68719476736 bytes, 134217728 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes

I've prepared the disks and added all of them to the cluster

ceph-deploy disk zap rmosd1:/dev/sdd rmosd2:/dev/sdd rmosd3:/dev/sdd
ceph-deploy osd prepare rmosd1:/dev/sdd rmosd2:/dev/sdd rmosd3:/dev/sdd
ceph-deploy osd activate rmosd1:/dev/sdd1:/dev/sdd2 rmosd2:/dev/sdd1:/dev/sdd2 rmosd3:/dev/sdd1:/dev/sdd2

Now some data will be moved to the new disks; I've done this to try the scenario of increasing capacity by adding new disks.
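
To verify that the new OSDs have joined the cluster and to follow the rebalancing, the usual status commands can be run from the monitor (a quick check, not part of the original run):

ceph osd tree
ceph -s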

From the monitor node we can modify the CRUSH map to separate the spinning disks from the SSDs. I've started by downloading the map

cd /etc/ceph
ceph osd getcrushmap -o /etc/ceph/compiled-crushmap

Decompiling it

crushtool -d /etc/ceph/compiled-crushmap -o /etc/ceph/decompiled-crushmap

Updating it from the Original Version

sudo vi /etc/ceph/decompiled-crushmap

# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable straw_calc_version 1
# devices
device 0 osd.0
device 1 osd.1
device 2 osd.2
device 3 osd.3
device 4 osd.4
device 5 osd.5
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root
# buckets
host rmosd1 {
id -2 # do not change unnecessarily
# weight 0.557
alg straw
hash 0 # rjenkins1
item osd.0 weight 0.500
item osd.3 weight 0.058
}
host rmosd2 {
id -3 # do not change unnecessarily
# weight 0.557
alg straw
hash 0 # rjenkins1
item osd.1 weight 0.500
item osd.4 weight 0.058
}
host rmosd3 {
id -4 # do not change unnecessarily
# weight 0.557
alg straw
hash 0 # rjenkins1
item osd.2 weight 0.500
item osd.5 weight 0.058
}
root default {
id -1 # do not change unnecessarily
# weight 1.672
alg straw
hash 0 # rjenkins1
item rmosd1 weight 0.557
item rmosd2 weight 0.557
item rmosd3 weight 0.557
}
# rules
rule replicated_ruleset {
ruleset 0
type replicated
min_size 1
max_size 10
step take default
step chooseleaf firstn 0 type host
step emit
}
# end crush map

To the New Version, with a new disktype layer:

# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable straw_calc_version 1
# devices
device 0 osd.0
device 1 osd.1
device 2 osd.2
device 3 osd.3
device 4 osd.4
device 5 osd.5
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root
type 11 disktype
# buckets
disktype rmosd1_ssd {
id -5 # do not change unnecessarily
# weight 0.058
alg straw
hash 0 # rjenkins1
item osd.3 weight 0.058
}
disktype rmosd1_spinning {
id -6 # do not change unnecessarily
# weight 0.500
alg straw
hash 0 # rjenkins1
item osd.0 weight 0.500
}
host rmosd1 {
id -2 # do not change unnecessarily
# weight 0.557
alg straw
hash 0 # rjenkins1
item rmosd1_ssd weight 0.058
item rmosd1_spinning weight 0.500
}
disktype rmosd2_ssd {
id -7 # do not change unnecessarily
# weight 0.058
alg straw
hash 0 # rjenkins1
item osd.4 weight 0.058
}
disktype rmosd2_spinning {
id -8 # do not change unnecessarily
# weight 0.500
alg straw
hash 0 # rjenkins1
item osd.1 weight 0.500
}
host rmosd2 {
id -3 # do not change unnecessarily
# weight 0.557
alg straw
hash 0 # rjenkins1
item rmosd2_ssd weight 0.058
item rmosd2_spinning weight 0.500
}
disktype rmosd3_ssd {
id -9 # do not change unnecessarily
# weight 0.058
alg straw
hash 0 # rjenkins1
item osd.5 weight 0.058
}
disktype rmosd3_spinning {
id -10 # do not change unnecessarily
# weight 0.500
alg straw
hash 0 # rjenkins1
item osd.2 weight 0.500
}
host rmosd3 {
id -4 # do not change unnecessarily
# weight 0.557
alg straw
hash 0 # rjenkins1
item rmosd3_ssd weight 0.058
item rmosd3_spinning weight 0.500
}
root default {
id -1 # do not change unnecessarily
# weight 1.672
alg straw
hash 0 # rjenkins1
item rmosd1 weight 0.557
item rmosd2 weight 0.557
item rmosd3 weight 0.557
}
root spinning {
id -11 # do not change unnecessarily
# weight 1.500
alg straw
hash 0 # rjenkins1
item rmosd1_spinning weight 0.500
item rmosd2_spinning weight 0.500
item rmosd3_spinning weight 0.500
}
root ssd {
id -12 # do not change unnecessarily
# weight 0.174
alg straw
hash 0 # rjenkins1
item rmosd1_ssd weight 0.058
item rmosd2_ssd weight 0.058
item rmosd3_ssd weight 0.058
}
# rules
rule replicated_ruleset {
ruleset 0
type replicated
min_size 1
max_size 10
step take spinning
step chooseleaf firstn 0 type disktype
step emit
}
rule spinning {
ruleset 1
type erasure
min_size 3
max_size 20
step set_chooseleaf_tries 5
step take spinning
step chooseleaf indep 0 type osd
step emit
}
rule ssd {
ruleset 2
type replicated
min_size 1
max_size 10
step take ssd
step chooseleaf firstn 0 type disktype
step emit
}
# end crush map

I've compiled the new map

crushtool -c /etc/ceph/decompiled-crushmap -o /etc/ceph/new-compiled-crushmap
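
Before injecting it, the new map can be sanity-checked with crushtool's test mode; a minimal sketch, where rule 2 is the ssd rule defined above:

crushtool --test -i /etc/ceph/new-compiled-crushmap --rule 2 --num-rep 3 --show-mappings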
And updated the CRUSH map in the cluster

ceph osd setcrushmap -i /etc/ceph/new-compiled-crushmap

Now the data is moved away from the SSD disks, since the default pool's replicated_ruleset only takes the spinning root.

I've created two pools, one SSD (hot) and one spinning (cold), and assigned the SSD rule to the hot one

ceph osd pool create hot-storage 100 100 replicated replicated_ruleset
ceph osd pool create cold-storage 100 100 replicated replicated_ruleset
ceph osd pool set hot-storage crush_ruleset 2
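
To confirm which ruleset each pool ended up with, the pool settings can be queried (a quick check, not shown in the original post):

ceph osd pool get hot-storage crush_ruleset
ceph osd pool get cold-storage crush_ruleset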

I've added hot-storage as a cache tier of the cold-storage pool

ceph osd tier add cold-storage hot-storage

I've set the cache mode to writeback, redirected the client traffic to the hot cache pool, and enabled the bloom hit set type

ceph osd tier cache-mode hot-storage writeback
ceph osd tier set-overlay cold-storage hot-storage
ceph osd pool set hot-storage hit_set_type bloom
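
A writeback cache tier also needs flush/evict targets so the agent knows when to move data back to the cold pool; a minimal sketch, where the 48 GB target_max_bytes and the ratios are my assumptions based on the 64 GB SSDs, not values from the original setup:

ceph osd pool set hot-storage target_max_bytes 51539607552
ceph osd pool set hot-storage cache_target_dirty_ratio 0.4
ceph osd pool set hot-storage cache_target_full_ratio 0.8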

From the client I've created a new 100 GB image on the cold-storage pool, formatted it, and mounted it

sudo mkdir -p /cephfs-cache
sudo rbd create cold-storage/nfsdata-cache --size 100G
sudo rbd feature disable cold-storage/nfsdata-cache fast-diff object-map deep-flatten exclusive-lock
sudo rbd map cold-storage/nfsdata-cache
rbd showmapped
sudo mkfs.xfs /dev/rbd0
sudo mount /dev/rbd0 /cephfs-cache
Then tested the mount

df -hT
I’ve added an NFS export of the new folder and added a new datastore on the VMware cluster.
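
For reference, the NFS export can be a single line in /etc/exports followed by a re-export; the export options here are an assumption, not taken from the original configuration:

echo "/cephfs-cache *(rw,sync,no_root_squash)" | sudo tee -a /etc/exports
sudo exportfs -ra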
I've started the same copy and, from the monitor server, checked that the cache is working properly with

sudo ceph osd pool stats

The space is consumed in the hot-storage pool instead of the cold-storage one, as also shown by

sudo rados df

From the dashboard I can check the performance

The copy now takes 18:17 instead of the 25:15 needed on pure spinning disks, with almost double the throughput of before.
Afterwards I tried reconfiguring the SSDs as journal devices, and it seems to give about the same performance as SSD cache tiering.
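
For the journal test, ceph-deploy accepts a host:data:journal triplet, so the SSD can be passed as the journal device of a spinning OSD; a sketch with hypothetical device names, since the post doesn't show the exact command used:

ceph-deploy osd prepare rmosd1:/dev/sdb:/dev/sdd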

