rewrite most of matrixpurge script

This commit is contained in:
HF 2023-07-06 21:51:49 +02:00
parent 986e0b5d76
commit a91093ab4a
2 changed files with 283 additions and 87 deletions

View File

@ -1,17 +1,63 @@
# Purge old media
# Clean up postgres synapse database
Shell script that cleans up the matrix postgresql database,
removes push notifications that aren't needed,
kicks out inactive users from rooms,
cleans up states with synapse_auto_compressor and so on.
Shell script that cleans up the matrix postgresql database:
- removes push notifications that aren't needed
- kicks out inactive users from rooms
- cleans up states with synapse_auto_compressor
- deletes messages that are older than 14 days from rooms
## Running
1. Set SQL credentials and URL to local matrix in the script
1. Set SQL credentials, URL and homeserver to local matrix in the script
2. build [synapse_auto_compressor](https://github.com/matrix-org/rust-synapse-compress-state) and set its path in the script
3. make sure that the bridge has been started at least once (it creates rooms and adds an admin user that we need)
4. add it as a cron job all 6h or so, like:
4. make sure that you do NOT have a [Message Retention Policy](https://matrix-org.github.io/synapse/latest/message_retention_policies.html) set, because this script purges old messages itself. A `media_retention` setting is still needed, however: this script will not delete any media.
5. add it as a cron job, like:
```
0 */6 * * * root /etc/matrix-synapse/matrixpurge.sh
0 2,8,14,23 * * * root /etc/matrix-synapse/matrixpurge.sh
12 11 * * 0 root /etc/matrix-synapse/matrixpurge.sh reset
```
The `"reset"` argument resets the synapse_auto_compressor. It shouldn't be run often, but it might come in handy if the compressor ends up in a weird state:
```
/etc/matrix-synapse/matrixpurge.sh reset
```
## Further resources
- [Shrink Synapse Database](https://levans.fr/shrink-synapse-database.html)
- [Message retention policies](https://github.com/matrix-org/synapse/blob/develop/docs/message_retention_policies.md)
- [Purge History API](https://github.com/matrix-org/synapse/blob/develop/docs/admin_api/purge_history_api.md)
- [Find unreferenced state groups](https://github.com/erikjohnston/synapse-find-unreferenced-state-groups)
- [matrix-synapse purge_events.py](https://github.com/matrix-org/synapse/blob/develop/synapse/storage/databases/main/purge_events.py)
- [remove traces of rooms from the db](https://github.com/matrix-org/synapse/issues/14539)
## Useful commands
check currently active queries:
```sql
SELECT pid, query, NOW() - query_start AS elapsed FROM pg_stat_activity WHERE query != '<IDLE>';
```
check events of room sorted by time:
```sql
select content, type, received_ts from events where room_id = '!scTbMproDsaihhGesQ:pixelplanet.fun' and type = 'm.room.message' order by topological_ordering limit 100;
```
show tables by size:
```sql
SELECT nspname || '.' || relname AS "relation",
pg_size_pretty(pg_total_relation_size(c.oid)) AS "total_size"
FROM pg_class c
LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace)
WHERE nspname NOT IN ('pg_catalog', 'information_schema')
AND c.relkind <> 'i'
AND nspname !~ '^pg_toast'
ORDER BY pg_total_relation_size(c.oid) DESC
LIMIT 20;
```

View File

@ -1,7 +1,6 @@
#!/bin/sh
# Do various clean-up tasks in matrix postgresql database
# run as cron job all 6h or so
# run with argument reset every month or every week
# URL to connect to matrix
MATRIXURL="http://localhost:8008"
@ -9,106 +8,257 @@ MATRIXURL="http://localhost:8008"
SQLUSER=synapse
SQLPASSWD=password
SQLDB=synapse
# path to synapse_auto_compressor binary
# path to synapse_auto_compressor binary (you gotta download that)
# see https://github.com/matrix-org/rust-synapse-compress-state
SYNAPSE_COMPRESSOR_PATH="/etc/matrix-synapse/synapse_auto_compressor"
# prefix for bridge users and rooms (hardcoded in bridge)
PREFIX="pp"
# homeserver
HOMESERVER="pixelplanet.fun"
# admin user of bridge channels
# (bridge creates him automatically, just make sure to run it at least once before running this script)
ADMINID="@${PREFIX}_admin:pixelplanet.fun"
ADMINID="@${PREFIX}_admin:${HOMESERVER}"
# path to synapse-find-unreferenced-state-groups
# see https://github.com/erikjohnston/synapse-find-unreferenced-state-groups
# (not needed if you will never use the "clean_states" argument)
SYNAPSE_UNREFERENCED_STATES="/etc/matrix-synapse/rust-synapse-find-unreferenced-state-groups"
# ANSI color codes. The values hold literal "\033" sequences, so they only
# turn into real escape characters when printf interprets them.
R='\033[0;31m'  # red
G='\033[0;32m'  # green
Y='\033[1;33m'  # bold yellow (33 is yellow; 32 would be green)
B='\033[0;34m'  # blue
L='\033[0;36m'  # cyan
NC='\033[0m'    # reset all attributes ("no color")
PIDFILE="/var/run/matrixpurge.pid"
echo "----------CLEANING UP POSTGRESQL MATRIX DATABASE------------"
cd /var/lib/postgresql
echo "--Get token for admin user"
# Get the admin access token for matrix-synapse, creating a new one if needed.
# Looks in access_tokens for a token of ${ADMINID} with device_id 'SQLCLEANER';
# when none is found, a random UUID is inserted as a new token.
# Globals read:    SQLUSER, SQLPASSWD, SQLDB, ADMINID
# Globals written: TOKEN (always), TOKENID (only when a token is created)
get_admin_token () {
  echo "--Get token for admin user"
  # xargs strips the padding/newline that psql -t puts around the value
  TOKEN=$(psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "select token from access_tokens where user_id = '${ADMINID}' and device_id = 'SQLCLEANER' limit 1;" | xargs)
  if [ -z "${TOKEN}" ]; then
    echo "Non exists, generating new Token..."
    TOKEN=$(cat /proc/sys/kernel/random/uuid)
    # coalesce(): max(id) is NULL on an empty table, which would leave the
    # id column of the insert below empty
    TOKENID=$(psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "select coalesce(max(id), 0) + 1 from access_tokens" | xargs)
    # last_validated is a fixed millisecond timestamp taken over unchanged
    psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "insert into access_tokens(id, user_id, token, device_id, last_validated, used) values (${TOKENID}, '${ADMINID}', '${TOKEN}', 'SQLCLEANER', 1656788062940, 'f')"
  fi
}
TOKEN=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select token from access_tokens where user_id = '${ADMINID}' and device_id = 'SQLCLEANER' limit 1;" | xargs`
if [ -z ${TOKEN} ]
then
echo "Non exists, generating new Token..."
TOKEN=`cat /proc/sys/kernel/random/uuid`
TOKENID=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select max(id) + 1 from access_tokens"`
psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "insert into access_tokens(id, user_id, token, device_id, last_validated, used) values (${TOKENID}, '${ADMINID}', '${TOKEN}', 'SQLCLEANER', 1656788062940, 'f')"
fi
# Delete a room through the Synapse admin API (v2 "delete room") and wait
# until the background deletion has left its in-progress states.
# Arguments:       $1 - room id
# Globals read:    TOKEN, MATRIXURL
# Globals written: ROOM, D_IDRET, D_ID, D_STATUS, D_STATUSRET
# Outputs:         progress dots and the final status JSON on stdout
# Returns:         1 when the API did not hand back a delete_id
purge_room () {
  ROOM="${1}"
  D_IDRET=$(curl --silent --max-time 900 --insecure -XDELETE -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d '{"block": false, "purge": true}' "${MATRIXURL}/_synapse/admin/v2/rooms/${ROOM}")
  D_ID=$(printf '%s\n' "${D_IDRET}" | jq -r '.delete_id')
  # empty D_ID: curl failed or the reply was not JSON; "null": API error reply
  if [ -z "${D_ID}" ] || [ "${D_ID}" = "null" ]; then
    printf '%s\n' "${D_IDRET}" | jq -r '.error'
    return 1
  fi
  # API data is passed as printf arguments, never as the format string
  printf 'Waiting for deletion %s for %s' "${D_ID}" "${ROOM}"
  D_STATUS="purging"
  D_STATUSRET=""
  sleep 4
  # the deletion reports "shutting_down" before "purging"; both mean it is
  # still running, anything else (complete, failed, empty) ends the wait
  while [ "${D_STATUS}" = "purging" ] || [ "${D_STATUS}" = "shutting_down" ]; do
    D_STATUSRET=$(curl --silent --max-time 900 --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v2/rooms/delete_status/${D_ID}")
    D_STATUS=$(printf '%s\n' "${D_STATUSRET}" | jq -r '.status')
    printf "."
    sleep 5
  done
  printf '\nPurge finished %s\n' "${D_STATUSRET}"
}
[ "${1}" = "reset" ] && {
# reset state_auto_compressor
echo "-- Reset rust-synapse-compress-state"
# Purge events older than 14 days from a room through the Synapse admin API.
# We do that here instead of per auto_retention, because it allows us to
# time it.
# Arguments:       $1 - room id
# Globals read:    TOKEN, MATRIXURL
# Globals written: ROOM, TS, P_IDRET, P_ID, P_STATUS, P_STATUSRET
# Outputs:         progress dots and the final status JSON on stdout
# Returns:         1 when the API did not hand back a purge_id
purge_room_history () {
  ROOM="${1}"
  # ms timestamp of 30 days ago
  #TS=$(( $(date +%s) * 1000 - 2592000000 ))
  # ms timestamp of 14 days ago (seconds * 1000 works with any date(1),
  # the %3N format is a GNU extension)
  TS=$(( $(date +%s) * 1000 - 1209600000 ))
  P_IDRET=$(curl --silent --max-time 900 --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"delete_local_events\": true, \"purge_up_to_ts\": ${TS}}" "${MATRIXURL}/_synapse/admin/v1/purge_history/${ROOM}")
  P_ID=$(printf '%s\n' "${P_IDRET}" | jq -r '.purge_id')
  # empty P_ID: curl failed or the reply was not JSON; "null": API error reply
  if [ -z "${P_ID}" ] || [ "${P_ID}" = "null" ]; then
    printf '%s\n' "${P_IDRET}" | jq -r '.error'
    return 1
  fi
  # API data is passed as printf arguments, never as the format string
  printf 'Waiting for purge %s for %s' "${P_ID}" "${ROOM}"
  P_STATUS="active"
  P_STATUSRET=""
  sleep 4
  # poll until the purge is no longer reported as "active"
  while [ "${P_STATUS}" = "active" ]; do
    P_STATUSRET=$(curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/purge_history_status/${P_ID}")
    P_STATUS=$(printf '%s\n' "${P_STATUSRET}" | jq -r '.status')
    printf "."
    sleep 5
  done
  printf '\nPurge finished with %s\n' "${P_STATUSRET}"
  # Remember that if you delete a room, its state groups might still be around
  # https://github.com/erikjohnston/synapse-find-unreferenced-state-groups
}
# Walk over every room known to the server (paged admin room list) and
#  - purge rooms on other servers that have no local members,
#  - purge rooms without any members,
#  - for rooms with more than 10 members, purge history older than 14 days
#    on roughly 40% of the runs.
# Globals read:    TOKEN, MATRIXURL, HOMESERVER, L, G, R, NC
# Globals written: NEXT_BATCH, RET
# Calls:           purge_room, purge_room_history
check_rooms () {
  echo "-- Checking individual rooms"
  NEXT_BATCH=0
  while [ "${NEXT_BATCH}" != "null" ]; do
    RET=$(curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/rooms?order_by=joined_members&from=${NEXT_BATCH}")
    # jq emits one "members local_members room_id" line per room, so names
    # or topics with quotes/backslashes never pass through the shell
    printf '%s\n' "${RET}" \
      | jq -r '.rooms[] | "\(.joined_members) \(.joined_local_members) \(.room_id)"' \
      | while read -r amount amount_local id; do
          # server part of the room id: everything after the first ':'
          server=${id#*:}
          printf '=> %b%s = %b%s,%s %b%s%b\n' "${L}" "${id}" "${G}" "${amount}" "${amount_local}" "${R}" "${server}" "${NC}"
          if [ "${server}" != "${HOMESERVER}" ] && [ "${amount_local}" = "0" ]; then
            echo "Room ${id} is federated, but has no local users in it, purging..."
            purge_room "${id}"
          # numeric comparison; inside [ ] a bare > is an output redirection.
          # stderr is dropped for non-numeric counts such as "null"
          elif [ "${amount}" = "0" ]; then
            echo "Room ${id} has no members, purging..."
            purge_room "${id}"
          elif [ "${amount}" -gt 10 ] 2>/dev/null; then
            # randomly decide if purge should happen or not: the first digit
            # of the current nanoseconds is 0-3 in about 4 out of 10 cases
            if [ "$(date +%1N)" -lt 4 ]; then
              echo "Room ${id} with >10 users, magic decided to delete history of >14d ago..."
              purge_room_history "${id}"
            fi
          fi
        done
    NEXT_BATCH=$(printf '%s\n' "${RET}" | jq -r '.next_batch')
    # an empty value means the reply was missing or not JSON; without this
    # check the loop would request the same page forever
    if [ -z "${NEXT_BATCH}" ]; then
      echo "Could not read room list from ${MATRIXURL}, stopping room check" >&2
      break
    fi
  done
}
# Collect unreferenced state groups of the 20 rooms with the most members.
# Arguments:       $1 - file that receives the collected output of all rooms
#                       (removed first if it already exists)
# Globals read:    TOKEN, MATRIXURL, SYNAPSE_UNREFERENCED_STATES, SQLUSER,
#                  SQLPASSWD, SQLDB, L, NC
# Globals written: files, RET, SGS_TMPDIR
check_state_groups () {
  files="${1}"
  echo "--- Checking largest room state groups"
  rm -f -- "${files}"
  # private scratch directory instead of a fixed, guessable name in /tmp
  SGS_TMPDIR=$(mktemp -d) || return 1
  RET=$(curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/rooms?order_by=joined_members&limit=20")
  # only the room ids are handed to the shell, so room names containing
  # quotes or backslashes cannot corrupt the loop input
  printf '%s\n' "${RET}" | jq -r '.rooms[].room_id' | while read -r id; do
    printf '=> %b%s%b\n' "${L}" "${id}" "${NC}"
    # stdin is detached so the tool can never swallow the remaining room ids
    nice -n 10 "${SYNAPSE_UNREFERENCED_STATES}" -p "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -r "${id}" -o "${SGS_TMPDIR}/sgstmp.txt" < /dev/null
    if [ -f "${SGS_TMPDIR}/sgstmp.txt" ]; then
      cat "${SGS_TMPDIR}/sgstmp.txt" >> "${files}"
      rm "${SGS_TMPDIR}/sgstmp.txt"
    fi
  done
  rm -rf -- "${SGS_TMPDIR}"
}
# clear unreferenced state groups
# Collects the unreferenced state groups twice (10 seconds apart), and only
# if both results are identical deletes them from state_groups_state,
# state_group_edges and state_groups, then reindexes the database.
# matrix-synapse is stopped for the deletion and started again afterwards.
# Globals read: SQLUSER, SQLPASSWD, SQLDB
# Uses the fixed scratch files /tmp/sgs.txt and /tmp/sgs2.txt.
clear_state_groups () {
echo "--Clearing unreferenced state groups"
check_state_groups "/tmp/sgs.txt"
sleep 10
check_state_groups "/tmp/sgs2.txt"
# diff is non-zero when the two lists differ (or a file is missing);
# "exit" ends the whole script here, not just this function
diff "/tmp/sgs.txt" "/tmp/sgs2.txt" > /dev/null || {
echo "State groups changed while checking, exiting..."
exit 1
}
rm "/tmp/sgs2.txt"
# can't run this while synapse is running
systemctl stop matrix-synapse
# the ids to delete are fed to COPY ... FROM stdin from /tmp/sgs.txt
psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "CREATE TEMPORARY TABLE unreffed(id BIGINT PRIMARY KEY); COPY unreffed FROM stdin WITH (FORMAT 'csv'); DELETE FROM state_groups_state WHERE state_group IN (SELECT id FROM unreffed); DELETE FROM state_group_edges WHERE state_group IN (SELECT id FROM unreffed); DELETE FROM state_groups WHERE id IN (SELECT id FROM unreffed);" < /tmp/sgs.txt
rm "/tmp/sgs.txt"
psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "REINDEX (VERBOSE) DATABASE ${SQLDB}"
# NOTE(review): synapse is started again regardless of whether the psql
# commands above succeeded - confirm that this is intended
systemctl start matrix-synapse
}
# Get all appsocket users of a room that did not send any message in the past
# 30 days and have no entry in user_ips, and print a curl config (see
# curl --config) with one kick request per user, separated by "next".
# Arguments:       $1 - room id
# Globals read:    SQLUSER, SQLPASSWD, SQLDB, PREFIX, HOMESERVER, ADMINID,
#                  MATRIXURL, TOKEN
# Globals written: room, ASUSERS, KICKURL, CNT
# Outputs:         the curl config on stdout
# Returns:         0 when at least one user was printed, 1 when there is
#                  nobody to kick (lets the caller skip curl with &&)
get_curl_config () {
  room=${1}
  # bridge users are matched on the configured homeserver, not a fixed domain
  ASUSERS=$(psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "select user_id from users_in_public_rooms u where room_id = '${room}' and user_id like '@${PREFIX}_%:${HOMESERVER}' and user_id != '${ADMINID}' and not exists ( select from events where room_id = '${room}' and sender = u.user_id and to_timestamp(received_ts/1000) > now() - interval '30 DAYS' ) and not exists ( select from user_ips where user_id = u.user_id )")
  KICKURL="${MATRIXURL}/_matrix/client/v3/rooms/${room}/kick"
  # CNT doubles as "first entry" flag and as return value
  CNT=1
  # intentionally unquoted: psql prints one padded user id per line
  for user in ${ASUSERS}; do
    if [ "${CNT}" -eq 1 ]; then
      CNT=0
    else
      # "next" starts a new request inside the same curl config
      printf '%s\n' "next"
    fi
    # printf '%s' keeps the backslashes exactly as written, independent of
    # how the shell's echo treats escape sequences
    printf '%s\n' "header=\"Authorization: Bearer ${TOKEN}\""
    printf '%s\n' "header=\"Content-Type: application/json\""
    printf '%s\n' "data=\"{\\\"user_id\\\": \\\"${user}\\\"}\""
    printf '%s\n' "url=${KICKURL}"
  done
  return ${CNT}
}
# Do stuff for appsocket rooms (every room with an alias '#${PREFIX}_...'):
# delete event_push_actions of bridge users that were not seen during the
# last 2 days and kick inactive bridge users.
# Globals read:    SQLUSER, SQLPASSWD, SQLDB, PREFIX, HOMESERVER
# Globals written: ROOMS, room, KICKCONF
# Calls:           get_curl_config
check_api_rooms () {
  ROOMS=$(psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "select room_id from room_aliases where room_alias like '#${PREFIX}_%'")
  # intentionally unquoted: psql prints one padded room id per line
  for room in ${ROOMS}; do
    echo "=> Delete event_push_actions of not logged-in users from romm ${room} and kick out inactive users..."
    # Clean event_push_actions of not-logged-in application service users
    # see https://github.com/matrix-org/synapse/issues/5569
    # This command can be really slow, if it takes too long, remove the "not exists..." part and it won't care about if logged in or not
    psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "delete from event_push_actions u where room_id = '${room}' and user_id like '@${PREFIX}_%:${HOMESERVER}' and not exists ( select from user_ips where user_id = u.user_id and to_timestamp(last_seen/1000) > now() - interval '2 days' )"
    # the curl config contains the admin bearer token, so it goes into a
    # private mktemp file instead of a fixed, guessable name in /tmp
    KICKCONF=$(mktemp) || return 1
    # get_curl_config returns non-zero when there is nobody to kick
    if get_curl_config "${room}" > "${KICKCONF}"; then
      curl --silent --parallel --parallel-immediate --parallel-max 10 --config "${KICKCONF}" && echo ""
    fi
    rm -f -- "${KICKCONF}"
  done
}
# Disable ratelimit on admin API: posts messages_per_second = 0 to the
# override_ratelimit admin endpoint of ${ADMINID}.
# Globals read:    ADMINID, MATRIXURL, TOKEN
# Globals written: RATEURL
disable_ratelimit () {
  echo "--Disabling ratelimit for admin user ${ADMINID}..."
  RATEURL="${MATRIXURL}/_synapse/admin/v1/users/${ADMINID}/override_ratelimit"
  # the URL is quoted so it always reaches curl as one single argument
  curl --silent --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d '{"messages_per_second": 0}' "${RATEURL}"
}
# Reset state compressor if requested: drops the three bookkeeping tables
# state_compressor_state, state_compressor_progress and
# state_compressor_total_progress.
# Globals read: SQLUSER, SQLPASSWD, SQLDB
reset_state_compressor () {
  echo "--Reset rust-synapse-compress-state"
  # "if exists": a table that is already gone must not make the whole
  # command fail and leave the other tables in place
  psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "drop table if exists state_compressor_state; drop table if exists state_compressor_progress; drop table if exists state_compressor_total_progress;"
}
# purge events in a room - just for reference - ignore this block
# TODO provide a function that purges a room and also its state_groups
#ROOM="scTbMproDsaihhGesQ:pixelplanet.fun"
#DELURL="${MATRIXURL}/_synapse/admin/v1/purge_history/!${ROOM}"
#curl --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"delete_local_events\": true, \"purge_up_to_ts\": 1660340843343}" ${DELURL}
# and its status check
#DELURL="${MATRIXURL}/_synapse/admin/v1/purge_history_status/MtGGeIGbsYTqdCUF"
#curl --insecure -XGET -H "Authorization: Bearer ${TOKEN}" ${DELURL}
# Remember that if you delete a room, it's state groups are still around
# https://github.com/erikjohnston/synapse-find-unreferenced-state-groups
#exit
# -----------------------------------------------------------------
# Run synapse_auto_compressor on the synapse database at reduced CPU priority.
# Globals read: SYNAPSE_COMPRESSOR_PATH, SQLUSER, SQLPASSWD, SQLDB
compress_state () {
  echo "--Compress states..."
  # https://github.com/matrix-org/rust-synapse-compress-state
  # path and connection URL are quoted so spaces or glob characters in them
  # cannot split or expand the arguments; -c 500 and -n 100 are kept as-is
  nice -n 10 "${SYNAPSE_COMPRESSOR_PATH}" -p "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c 500 -n 100
}
# Disable ratelimit
echo "--Disabling ratelimit for admin user ${ADMINID}..."
RATEURL="${MATRIXURL}/_synapse/admin/v1/users/${ADMINID}/override_ratelimit"
curl --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"messages_per_second\": 0}" ${RATEURL}
# Delete old rows from cache_invalidation_stream_by_instance and reclaim disk
# space with VACUUM FULL. Both commands are timed.
# Globals read: SQLUSER, SQLPASSWD, SQLDB
clean_db () {
  echo "--Clean up cache_invalidation_stream_by_instance"
  # see https://github.com/matrix-org/synapse/issues/8269
  # "<": remove rows OLDER than one month. The previous ">" removed the
  # recent rows and kept the old ones forever.
  time psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "delete from cache_invalidation_stream_by_instance where to_timestamp(invalidation_ts/1000) < now() - interval '1 months';"
  echo "--Vaccum..."
  time psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "VACUUM FULL"
}
ROOMS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select room_id from room_aliases where room_alias like '#${PREFIX}_%'"`
get_curl_config () {
room=${1}
# Get all appsockets users from a public room that did not send any message in the past 48h
ASUSERS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select user_id from users_in_public_rooms u where room_id = '${room}' and user_id like '@${PREFIX}_%:pixelplanet.fun' and user_id != '${ADMINID}' and not exists ( select from events where room_id = '${room}' and sender = u.user_id and to_timestamp(received_ts/1000) > now() - interval '4 DAYS' ) and not exists ( select from user_ips where user_id = u.user_id )"`
KICKURL="${MATRIXURL}/_matrix/client/v3/rooms/${room}/kick"
CNT=1
for user in ${ASUSERS}
do
if [ ${CNT} -eq 1 ]
then
CNT=0
else
echo "next"
fi
echo "header=\"Authorization: Bearer ${TOKEN}\""
echo "header=\"Content-Type: application/json\""
echo "data=\"{\\\"user_id\\\": \\\"${user}\\\"}\""
echo "url=${KICKURL}"
done
return ${CNT}
# Print the total size of the database and its 20 largest relations.
# Globals read: SQLUSER, SQLPASSWD, SQLDB
print_stats () {
  echo "--DONE. Current database size is..."
  # size of the configured database, not of a fixed name
  time psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "SELECT pg_size_pretty( pg_database_size( '${SQLDB}' ) )"
  # the next query contains "!~"; switch off history expansion in case this
  # runs under bash with it enabled
  if [ -n "${BASH_VERSION}" ]; then
    set +H
  fi
  psql "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "SELECT nspname || '.' || relname AS \"relation\", pg_size_pretty(pg_total_relation_size(c.oid)) AS \"total_size\" FROM pg_class c LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) WHERE nspname NOT IN ('pg_catalog', 'information_schema') AND c.relkind <> 'i' AND nspname !~ '^pg_toast' ORDER BY pg_total_relation_size(c.oid) DESC LIMIT 20;"
}
for room in ${ROOMS}
do
echo "--Delete event_push_actions of not logged-in users from romm ${room}..."
# Clean event_push_actions of not-logged-in application service users
# see https://github.com/matrix-org/synapse/issues/5569
# This command can be really slow, if it takes too long, remove the "not exists..." part and it won't care about if logged in or not
time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from event_push_actions u where room_id = '${room}' and user_id like '@${PREFIX}_%:pixelplanet.fun' and not exists ( select from user_ips where user_id = u.user_id and to_timestamp(last_seen/1000) > now() - interval '2 days' )"
echo "--Kick out inactive users from room ${room}..."
get_curl_config "${room}" > /tmp/curlkick.tmp && curl --parallel --parallel-immediate --parallel-max 10 --config /tmp/curlkick.tmp && echo ""
rm /tmp/curlkick.tmp
done
# Refuse to start while another instance is still running: the PID stored in
# ${PIDFILE} is checked with ps, so a stale file of a dead process is ignored.
if [ -f "${PIDFILE}" ] && ps -p "$(cat "${PIDFILE}")" > /dev/null 2>&1; then
  echo "matrixpurge.sh already running, exiting."
  exit 1
fi
echo $$ > "${PIDFILE}"
# Remove the PID file on every exit path, including early "exit" calls.
# Set only after the check above, so the file of a running instance is
# never deleted by a second invocation that bails out.
trap 'rm -f "${PIDFILE}"' EXIT
echo "--Clean up cache_invalidation_stream_by_instance"
# see https://github.com/matrix-org/synapse/issues/8269
time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from cache_invalidation_stream_by_instance where to_timestamp(invalidation_ts/1000) > now() - interval '1 months';"
echo "--Compress states..."
# https://github.com/matrix-org/rust-synapse-compress-state
${SYNAPSE_COMPRESSOR_PATH} -p postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c 500 -n 100
echo "--Vaccum..."
time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "VACUUM FULL VERBOSE"
echo "--DONE. Current database size is..."
time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "SELECT pg_size_pretty( pg_database_size( 'synapse' ) )"
# Load the admin access token into TOKEN; the admin API calls below use it.
get_admin_token
[ -n "${BASH_VERSION}" ] && set +H
psql postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "SELECT nspname || '.' || relname AS \"relation\", pg_size_pretty(pg_total_relation_size(c.oid)) AS \"total_size\" FROM pg_class c LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) WHERE nspname NOT IN ('pg_catalog', 'information_schema') AND c.relkind <> 'i' AND nspname !~ '^pg_toast' ORDER BY pg_total_relation_size(c.oid) DESC LIMIT 20;"
# "clean_states" mode: only clear unreferenced state groups, then stop.
# NOTE(review): this path exits before the rm of ${PIDFILE} at the end of
# the script is reached.
[ "${1}" = "clean_states" ] && {
clear_state_groups
exit 0
}
# Regular run: per-room purges first, then the bridge rooms.
check_rooms
disable_ratelimit
check_api_rooms
# With the "reset" argument the compressor's tables are dropped before it runs.
[ "${1}" = "reset" ] && reset_state_compressor
compress_state
clean_db
print_stats
# Release the PID file so that the next run may start.
rm "${PIDFILE}"