From a91093ab4a1eda9cbf96a46e22b68e6dc13172d9 Mon Sep 17 00:00:00 2001 From: HF Date: Thu, 6 Jul 2023 21:51:49 +0200 Subject: [PATCH] rewrite most of matrixpurge script --- purge/README.md | 62 +++++++-- purge/matrixpurge.sh | 308 ++++++++++++++++++++++++++++++++----------- 2 files changed, 283 insertions(+), 87 deletions(-) diff --git a/purge/README.md b/purge/README.md index a928953..78283cf 100644 --- a/purge/README.md +++ b/purge/README.md @@ -1,17 +1,63 @@ -# Purge old meia +# Clean up postgres synapse database -Shell script that cleans up the matrix postgresql database, -removes push notifications that aren't needed, -kicks out inactive users from rooms, -cleans up states with synapse_auto_compressor and so on. +Shell script that cleans up the matrix postgresql database: + - removes push notifications that aren't needed + - kicks out inactive users from rooms + - cleans up states with synapse_auto_compressor + - deletes messages that are older than 14 days from rooms ## Running -1. Set SQL credentials and URL to local matrix in the script +1. Set SQL credentials, URL and homeserver to local matrix in the script 2. build [synapse_auto_compressor](https://github.com/matrix-org/rust-synapse-compress-state) and set its path in the script 3. make sure that the bridge did start at least once (it creates rooms and adds an admin user that we need) -4. add it as a cron job all 6h or so, like: +4. make sure that you do NOT have a [Message Retention Policy](https://matrix-org.github.io/synapse/latest/message_retention_policies.html) set, because this script does it for you. However, media_retention is still needed, because this script will not delete any media. +5. 
add it as a cron job, like: ``` -0 */6 * * * root /etc/matrix-synapse/matrixpurge.sh +0 2,8,14,23 * * * root /etc/matrix-synapse/matrixpurge.sh +12 11 * * 0 root /etc/matrix-synapse/matrixpurge.sh reset +``` + +The `"reset"` argument is for resetting the synapse_auto_compressor. It shouldn't be run often, but might come in handy if the compressor ends up in a weird state: + +``` +/etc/matrix-synapse/matrixpurge.sh reset +``` + +## Further resources + +- [Shrink Synapse Database](https://levans.fr/shrink-synapse-database.html) +- [Message retention policies](https://github.com/matrix-org/synapse/blob/develop/docs/message_retention_policies.md) +- [Purge History API](https://github.com/matrix-org/synapse/blob/develop/docs/admin_api/purge_history_api.md) +- [Find unreferenced state groups](https://github.com/erikjohnston/synapse-find-unreferenced-state-groups) +- [matrix-synapse purge_events.py](https://github.com/matrix-org/synapse/blob/develop/synapse/storage/databases/main/purge_events.py) +- [remove traces of rooms from the db](https://github.com/matrix-org/synapse/issues/14539) + +## Useful commands + +check currently active queries: + +```sql +SELECT pid, query, NOW() - query_start AS elapsed FROM pg_stat_activity WHERE query != ''; +``` + +check events of room sorted by time: + +```sql +select content, type, received_ts from events where room_id = '!scTbMproDsaihhGesQ:pixelplanet.fun' and type = 'm.room.message' order by topological_ordering limit 100; +``` + +show tables by size: + +```sql +SELECT nspname || '.' 
|| relname AS "relation", + pg_size_pretty(pg_total_relation_size(c.oid)) AS "total_size" + FROM pg_class c + LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) + WHERE nspname NOT IN ('pg_catalog', 'information_schema') + AND c.relkind <> 'i' + AND nspname !~ '^pg_toast' + ORDER BY pg_total_relation_size(c.oid) DESC + LIMIT 20; ``` diff --git a/purge/matrixpurge.sh b/purge/matrixpurge.sh index d99b8c3..71b5f3f 100755 --- a/purge/matrixpurge.sh +++ b/purge/matrixpurge.sh @@ -1,7 +1,6 @@ #!/bin/sh # Do various clean-up tasks in matrix postgresql database # run as cron job all 6h or so -# run with argument reset every month or every week # URL to connect to matrix MATRIXURL="http://localhost:8008" @@ -9,106 +8,257 @@ MATRIXURL="http://localhost:8008" SQLUSER=synapse SQLPASSWD=password SQLDB=synapse -# path to synapse_auto_compressor binary +# path to synapse_auto_compressor binary (you gotta download that) # see https://github.com/matrix-org/rust-synapse-compress-state SYNAPSE_COMPRESSOR_PATH="/etc/matrix-synapse/synapse_auto_compressor" - - # prefix for bridge users and rooms (hardcoded in bridge) PREFIX="pp" +# homeserver +HOMESERVER="pixelplanet.fun" # admin user of bridge channels # (bridge creates him automatically, just make sure to run it at least once before running this script) -ADMINID="@${PREFIX}_admin:pixelplanet.fun" +ADMINID="@${PREFIX}_admin:${HOMESERVER}" + +# path to synapse-find-unreferenced-state-groups +# see https://github.com/erikjohnston/synapse-find-unreferenced-state-groups +# (not needed if you will never use the "clean_states" argument) +SYNAPSE_UNREFERENCED_STATES="/etc/matrix-synapse/rust-synapse-find-unreferenced-state-groups" + +# ANSI color codes +R='\033[0;31m' #'0;31' is Red's ANSI color code +G='\033[0;32m' #'0;32' is Green's ANSI color code +Y='\033[1;33m' #'1;33' is Yellow's ANSI color code +B='\033[0;34m' #'0;34' is Blue's ANSI color code +L='\033[0;36m' #'0;36' is Cyan's ANSI color code +NC='\033[0m' 
+PIDFILE="/var/run/matrixpurge.pid" echo "----------CLEANING UP POSTGRESQL MATRIX DATABASE------------" cd /var/lib/postgresql -echo "--Get token for admin user" +# get admin token for matrix-synapse (stored in TOKEN), creating a new one if needed +get_admin_token () { + echo "--Get token for admin user" + TOKEN=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select token from access_tokens where user_id = '${ADMINID}' and device_id = 'SQLCLEANER' limit 1;" | xargs` + if [ -z "${TOKEN}" ]; then + echo "Non exists, generating new Token..." + TOKEN=`cat /proc/sys/kernel/random/uuid` + TOKENID=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select coalesce(max(id), 0) + 1 from access_tokens"` + psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "insert into access_tokens(id, user_id, token, device_id, last_validated, used) values (${TOKENID}, '${ADMINID}', '${TOKEN}', 'SQLCLEANER', 1656788062940, 'f')" + fi +} -TOKEN=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select token from access_tokens where user_id = '${ADMINID}' and device_id = 'SQLCLEANER' limit 1;" | xargs` -if [ -z ${TOKEN} ] -then - echo "Non exists, generating new Token..." 
- TOKEN=`cat /proc/sys/kernel/random/uuid` - TOKENID=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select max(id) + 1 from access_tokens"` - psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "insert into access_tokens(id, user_id, token, device_id, last_validated, used) values (${TOKENID}, '${ADMINID}', '${TOKEN}', 'SQLCLEANER', 1656788062940, 'f')" -fi +# purge room via the admin API and poll its delete status until it is no longer in progress +purge_room () { + ROOM="${1}" + D_IDRET=`curl --silent --max-time 900 --insecure -XDELETE -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"block\": false, \"purge\": true}" "${MATRIXURL}/_synapse/admin/v2/rooms/${ROOM}"` + D_ID=`echo "${D_IDRET}" | jq -r '.delete_id'` + if [ "${D_ID}" = "null" ]; then + echo "${D_IDRET}" | jq -r '.error' + return 1 + fi + printf 'Waiting for deletion %s for %s' "${D_ID}" "${ROOM}" + D_STATUS="purging" + D_STATUSRET="" + sleep 4 + while [ "${D_STATUS}" = "purging" ] || [ "${D_STATUS}" = "shutting_down" ]; do + D_STATUSRET=`curl --silent --max-time 900 --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v2/rooms/delete_status/${D_ID}"` + D_STATUS=`echo "${D_STATUSRET}" | jq -r '.status'` + printf "." 
+ sleep 5 + done + printf '\nPurge finished %s\n' "${D_STATUSRET}" +} -[ "${1}" = "reset" ] && { - # reset state_auto_compressor - echo "-- Reset rust-synapse-compress-state" +# purge events in a room - we do that here instead of per auto_retention, +# because it allows us to time it +purge_room_history () { + ROOM="${1}" + # ms timestamp of 30 days ago + #TS=$((`date +%s%3N` - 2592000000)) + # ms timestamp of 14 days ago + TS=$((`date +%s%3N` - 1209600000)) + P_IDRET=`curl --silent --max-time 900 --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"delete_local_events\": true, \"purge_up_to_ts\": ${TS}}" "${MATRIXURL}/_synapse/admin/v1/purge_history/${ROOM}"` + P_ID=`echo "${P_IDRET}" | jq -r '.purge_id'` + if [ "${P_ID}" = "null" ]; then + echo "${P_IDRET}" | jq -r '.error' + return 1 + fi + printf 'Waiting for purge %s for %s' "${P_ID}" "${ROOM}" + P_STATUS="active" + P_STATUSRET="" + sleep 4 + while [ "${P_STATUS}" = "active" ]; do + P_STATUSRET=`curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/purge_history_status/${P_ID}"` + P_STATUS=`echo "${P_STATUSRET}" | jq -r '.status'` + printf "." 
+ sleep 5 + done + printf '\nPurge finished with %s\n' "${P_STATUSRET}" + # Remember that if you delete a room, its state groups might still be around + # https://github.com/erikjohnston/synapse-find-unreferenced-state-groups +} + +# do stuff for each room on server +check_rooms () { + echo "-- Checking individual rooms" + NEXT_BATCH=0 + while [ -n "${NEXT_BATCH}" ] && [ "${NEXT_BATCH}" != "null" ]; do + RET=`curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/rooms?order_by=joined_members&from=${NEXT_BATCH}"` + echo "$RET" | jq -c '.rooms[]' | while read -r room + do + amount=$(echo "$room" | jq -r .joined_members) + amount_local=$(echo "$room" | jq -r .joined_local_members) + id=$(echo "$room" | jq -r .room_id) + server=$(echo "$id" | sed -e 's/[^:]*://') + printf "=> ${L}%s = ${G}%s,%s ${R}%s${NC}\n" "${id}" "${amount}" "${amount_local}" "${server}" + if [ "${server}" != "${HOMESERVER}" ] && [ "${amount_local}" = "0" ]; then + echo "Room ${id} is federated, but has no local users in it, purging..." + purge_room "${id}" + elif [ "${amount}" = "0" ]; then + echo "Room ${id} has no members, purging..." + purge_room "${id}" + elif [ "${amount}" -gt 10 ]; then + #randomly decide if purge should happen or not + [ `date +%1N` -lt 4 ] && { + echo "Room ${id} with >10 users, magic decided to delete history of >14d ago..." 
+ purge_room_history "${id}" + } + fi + done + NEXT_BATCH=`echo "$RET" | jq -r '.next_batch'` + done +} + +# check unreferenced state groups for 20 largest rooms +check_state_groups () { + files="${1}" + echo "--- Checking largest room state groups" + [ -f "${files}" ] && rm "${files}" + [ -f "/tmp/sgstmp.txt" ] && rm "/tmp/sgstmp.txt" + RET=`curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/rooms?order_by=joined_members&limit=20"` + echo "$RET" | jq -c '.rooms[]' | while read -r room + do + id=$(echo "$room" | jq -r .room_id) + printf "=> ${L}%s${NC}\n" "${id}" + nice -n 10 ${SYNAPSE_UNREFERENCED_STATES} -p postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -r "${id}" -o "/tmp/sgstmp.txt" + [ -f "/tmp/sgstmp.txt" ] && { + cat "/tmp/sgstmp.txt" >> "${files}" + rm "/tmp/sgstmp.txt" + } + done +} + +# clear unreferenced state groups +clear_state_groups () { + echo "--Clearing unreferenced state groups" + check_state_groups "/tmp/sgs.txt" + sleep 10 + check_state_groups "/tmp/sgs2.txt" + diff "/tmp/sgs.txt" "/tmp/sgs2.txt" > /dev/null || { + echo "State groups changed while checking, exiting..." 
+ exit 1 + } + rm "/tmp/sgs2.txt" + # can't run this while synapse is running + systemctl stop matrix-synapse + psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "CREATE TEMPORARY TABLE unreffed(id BIGINT PRIMARY KEY); COPY unreffed FROM stdin WITH (FORMAT 'csv'); DELETE FROM state_groups_state WHERE state_group IN (SELECT id FROM unreffed); DELETE FROM state_group_edges WHERE state_group IN (SELECT id FROM unreffed); DELETE FROM state_groups WHERE id IN (SELECT id FROM unreffed);" < /tmp/sgs.txt + rm "/tmp/sgs.txt" + psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "REINDEX (VERBOSE) DATABASE ${SQLDB}" + systemctl start matrix-synapse +} + +# get all appsocket users from a room that did not send any message in the past 30 days, +# and print a curl config to kick them (returns 0 if at least one user was printed, 1 otherwise) +get_curl_config () { + room=${1} + ASUSERS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select user_id from users_in_public_rooms u where room_id = '${room}' and user_id like '@${PREFIX}_%:${HOMESERVER}' and user_id != '${ADMINID}' and not exists ( select from events where room_id = '${room}' and sender = u.user_id and to_timestamp(received_ts/1000) > now() - interval '30 DAYS' ) and not exists ( select from user_ips where user_id = u.user_id )"` + KICKURL="${MATRIXURL}/_matrix/client/v3/rooms/${room}/kick" + CNT=1 + for user in ${ASUSERS}; do + if [ ${CNT} -eq 1 ]; then + CNT=0 + else + echo "next" + fi + echo "header=\"Authorization: Bearer ${TOKEN}\"" + echo "header=\"Content-Type: application/json\"" + echo "data=\"{\\\"user_id\\\": \\\"${user}\\\"}\"" + echo "url=${KICKURL}" + done + return ${CNT} +} + +# Do stuff for appsocket rooms +check_api_rooms () { + ROOMS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select room_id from room_aliases where room_alias like '#${PREFIX}_%'"` + for room in ${ROOMS} + do + echo "=> Delete event_push_actions of not logged-in users from romm ${room} and kick out inactive 
users..." + # Clean event_push_actions of not-logged-in application service users + # see https://github.com/matrix-org/synapse/issues/5569 + # This command can be really slow, if it takes too long, remove the "not exists..." part and it won't care about if logged in or not + psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from event_push_actions u where room_id = '${room}' and user_id like '@${PREFIX}_%:${HOMESERVER}' and not exists ( select from user_ips where user_id = u.user_id and to_timestamp(last_seen/1000) > now() - interval '2 days' )" + get_curl_config "${room}" > /tmp/curlkick.tmp && curl --silent --parallel --parallel-immediate --parallel-max 10 --config /tmp/curlkick.tmp && echo "" + rm /tmp/curlkick.tmp + done +} + +# disable ratelimit on admin API +disable_ratelimit () { + echo "--Disabling ratelimit for admin user ${ADMINID}..." + RATEURL="${MATRIXURL}/_synapse/admin/v1/users/${ADMINID}/override_ratelimit" + curl --silent --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"messages_per_second\": 0}" "${RATEURL}" +} + +# reset state compressor if requested +reset_state_compressor () { + echo "--Reset rust-synapse-compress-state" psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "drop table state_compressor_state; drop table state_compressor_progress; drop table state_compressor_total_progress;" } -# purge events in a room - just for reference - ignore this block -# TODO provide a function that purges a room and also its state_groups -#ROOM="scTbMproDsaihhGesQ:pixelplanet.fun" -#DELURL="${MATRIXURL}/_synapse/admin/v1/purge_history/!${ROOM}" -#curl --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"delete_local_events\": true, \"purge_up_to_ts\": 1660340843343}" ${DELURL} -# and its status check -#DELURL="${MATRIXURL}/_synapse/admin/v1/purge_history_status/MtGGeIGbsYTqdCUF" -#curl --insecure -XGET -H "Authorization: 
Bearer ${TOKEN}" ${DELURL} -# Remember that if you delete a room, it's state groups are still around -# https://github.com/erikjohnston/synapse-find-unreferenced-state-groups -#exit -# ----------------------------------------------------------------- +compress_state () { + echo "--Compress states..." + # https://github.com/matrix-org/rust-synapse-compress-state + nice -n 10 ${SYNAPSE_COMPRESSOR_PATH} -p postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c 500 -n 100 +} -# Disable ratelimit -echo "--Disabling ratelimit for admin user ${ADMINID}..." -RATEURL="${MATRIXURL}/_synapse/admin/v1/users/${ADMINID}/override_ratelimit" -curl --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"messages_per_second\": 0}" ${RATEURL} +clean_db () { + echo "--Clean up cache_invalidation_stream_by_instance" + # delete entries OLDER than one month, see https://github.com/matrix-org/synapse/issues/8269 + time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from cache_invalidation_stream_by_instance where to_timestamp(invalidation_ts/1000) < now() - interval '1 months';" + echo "--Vaccum..." 
+ time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "VACUUM FULL" +} -ROOMS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select room_id from room_aliases where room_alias like '#${PREFIX}_%'"` - - -get_curl_config () { - room=${1} - # Get all appsockets users from a public room that did not send any message in the past 48h - ASUSERS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select user_id from users_in_public_rooms u where room_id = '${room}' and user_id like '@${PREFIX}_%:pixelplanet.fun' and user_id != '${ADMINID}' and not exists ( select from events where room_id = '${room}' and sender = u.user_id and to_timestamp(received_ts/1000) > now() - interval '4 DAYS' ) and not exists ( select from user_ips where user_id = u.user_id )"` - KICKURL="${MATRIXURL}/_matrix/client/v3/rooms/${room}/kick" - CNT=1 - for user in ${ASUSERS} - do - if [ ${CNT} -eq 1 ] - then - CNT=0 - else - echo "next" - fi - echo "header=\"Authorization: Bearer ${TOKEN}\"" - echo "header=\"Content-Type: application/json\"" - echo "data=\"{\\\"user_id\\\": \\\"${user}\\\"}\"" - echo "url=${KICKURL}" - done - return ${CNT} +print_stats () { + echo "--DONE. Current database size is..." + time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "SELECT pg_size_pretty( pg_database_size( '${SQLDB}' ) )" + [ -n "${BASH_VERSION}" ] && set +H + psql postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "SELECT nspname || '.' || relname AS \"relation\", pg_size_pretty(pg_total_relation_size(c.oid)) AS \"total_size\" FROM pg_class c LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) WHERE nspname NOT IN ('pg_catalog', 'information_schema') AND c.relkind <> 'i' AND nspname !~ '^pg_toast' ORDER BY pg_total_relation_size(c.oid) DESC LIMIT 20;" } -for room in ${ROOMS} -do - echo "--Delete event_push_actions of not logged-in users from romm ${room}..." 
- # Clean event_push_actions of not-logged-in application service users - # see https://github.com/matrix-org/synapse/issues/5569 - # This command can be really slow, if it takes too long, remove the "not exists..." part and it won't care about if logged in or not - time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from event_push_actions u where room_id = '${room}' and user_id like '@${PREFIX}_%:pixelplanet.fun' and not exists ( select from user_ips where user_id = u.user_id and to_timestamp(last_seen/1000) > now() - interval '2 days' )" - echo "--Kick out inactive users from room ${room}..." - get_curl_config "${room}" > /tmp/curlkick.tmp && curl --parallel --parallel-immediate --parallel-max 10 --config /tmp/curlkick.tmp && echo "" - rm /tmp/curlkick.tmp -done +[ -f "${PIDFILE}" ] && ps -p `cat "${PIDFILE}"` > /dev/null && { + echo "matrixpurge.sh already running, exiting." + exit 1 +} +echo $$ > "${PIDFILE}" -echo "--Clean up cache_invalidation_stream_by_instance" -# see https://github.com/matrix-org/synapse/issues/8269 -time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from cache_invalidation_stream_by_instance where to_timestamp(invalidation_ts/1000) > now() - interval '1 months';" -echo "--Compress states..." -# https://github.com/matrix-org/rust-synapse-compress-state -${SYNAPSE_COMPRESSOR_PATH} -p postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c 500 -n 100 -echo "--Vaccum..." -time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "VACUUM FULL VERBOSE" -echo "--DONE. Current database size is..." -time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "SELECT pg_size_pretty( pg_database_size( 'synapse' ) )" +get_admin_token -[ -n "${BASH_VERSION}" ] && set +H -psql postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "SELECT nspname || '.' 
|| relname AS \"relation\", pg_size_pretty(pg_total_relation_size(c.oid)) AS \"total_size\" FROM pg_class c LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) WHERE nspname NOT IN ('pg_catalog', 'information_schema') AND c.relkind <> 'i' AND nspname !~ '^pg_toast' ORDER BY pg_total_relation_size(c.oid) DESC LIMIT 20;" +[ "${1}" = "clean_states" ] && { + clear_state_groups + exit 0 +} +check_rooms +disable_ratelimit +check_api_rooms +[ "${1}" = "reset" ] && reset_state_compressor +compress_state +clean_db +print_stats +rm "${PIDFILE}"