rewrite most of matrixpurge script

2023-07-06
2 changed files with 283 additions and 87 deletions

# Purge old media
# Clean up postgres synapse database
Shell script that cleans up the matrix postgresql database:
removes push notifications that aren't needed,
kicks out inactive users from rooms,
cleans up states with synapse_auto_compressor and so on.
Shell script that cleans up the matrix postgresql database:
- removes push notifications that aren't needed
- kicks out inactive users from rooms
- cleans up states with synapse_auto_compressor
- deletes messages that are older than 14 days from rooms
## Running
1. Set SQL credentials and URL to local matrix in the script
1. Set SQL credentials, URL and homeserver to local matrix in the script
2. build [synapse_auto_compressor](https://github.com/matrix-org/rust-synapse-compress-state) and set its path in the script
3. make sure that the bridge did start at least once (it creates rooms and adds an admin user that we need)
4. add it as a cron job that runs every 6 hours or so, like:
4. make sure that you do NOT have a [Message Retention Policy](https://matrix-org.github.io/synapse/latest/message_retention_policies.html) set, because this script handles message retention for you. However, media_retention is still needed: this script will not delete any media.
5. add it as a cron job, like:
0 */6 * * * root /etc/matrix-synapse/
0 2,8,14,23 * * * root /etc/matrix-synapse/
12 11 * * 0 root /etc/matrix-synapse/ reset
The `"reset"` argument is for resetting the synapse_auto_compressor. It shouldn't be run often, but might come in handy if the compressor ends up in a weird state:
/etc/matrix-synapse/ reset
## Further resources
- [Shrink Synapse Database](
- [Message retention policies](
- [Purge History API](
- [Find unreferenced state groups](
- [matrix-synapse](
- [remove traces of rooms from the db](
## Useful commands
check currently active queries:
SELECT pid, query, NOW() - query_start AS elapsed FROM pg_stat_activity WHERE query != '<IDLE>';
check events of room sorted by time:
select content, type, received_ts from events where room_id = '!' and type = '' order by topological_ordering limit 100;
show tables by size:
SELECT nspname || '.' || relname AS "relation",
pg_size_pretty(pg_total_relation_size(c.oid)) AS "total_size"
FROM pg_class c
LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace)
WHERE nspname NOT IN ('pg_catalog', 'information_schema')
AND c.relkind <> 'i'
AND nspname !~ '^pg_toast'
ORDER BY pg_total_relation_size(c.oid) DESC

# Do various clean-up tasks in matrix postgresql database
# run as a cron job every 6h or so
# run with argument reset every month or every week
# URL to connect to matrix
# path to synapse_auto_compressor binary
# path to synapse_auto_compressor binary (you gotta download that)
# see
# prefix for bridge users and rooms (hardcoded in bridge)
# homeserver
# admin user of bridge channels
# (bridge creates him automatically, just make sure to run it at least once before running this script)
# path to synapse-find-unreferenced-state-groups
# see
# (not needed if you will never use the "clean_states" argument)
# ANSI color codes
R='\033[0;31m' # red
G='\033[0;32m' # green
Y='\033[1;33m' # yellow (was 1;32, which is bold green, not yellow)
B='\033[0;34m' # blue
L='\033[0;36m' # cyan (light blue)
# Change into /var/lib/postgresql before issuing the psql commands below.
cd /var/lib/postgresql
echo "--Get token for admin user"
# Look up an access token of ${ADMINID} that is registered for the device
# 'SQLCLEANER'; xargs strips the padding that psql -t leaves around the value.
TOKEN=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select token from access_tokens where user_id = '${ADMINID}' and device_id = 'SQLCLEANER' limit 1;" | xargs`
# True when the lookup returned nothing.
# NOTE(review): ${TOKEN} is unquoted; the test only works for an empty value
# because `[ -z ]` with a single argument happens to be true.
if [ -z ${TOKEN} ]
# Get an admin access token for matrix-synapse, creating a new one if needed.
# Globals: reads SQLUSER, SQLPASSWD, SQLDB, ADMINID;
#          writes TOKEN (and TOKENID when a new token has to be inserted).
get_admin_token () {
echo "--Get token for admin user"
# Reuse the token registered for the SQLCLEANER device, if there is one
# (xargs strips the padding psql -t leaves around the value).
TOKEN=$(psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "select token from access_tokens where user_id = '${ADMINID}' and device_id = 'SQLCLEANER' limit 1;" | xargs)
if [ -z "${TOKEN}" ]; then
echo "Non exists, generating new Token..."
TOKEN=$(cat /proc/sys/kernel/random/uuid)
# coalesce() keeps the new id numeric even when access_tokens is still empty
# (max(id) would be NULL and the insert below would be invalid SQL).
TOKENID=$(psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "select coalesce(max(id), 0) + 1 from access_tokens" | xargs)
psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "insert into access_tokens(id, user_id, token, device_id, last_validated, used) values (${TOKENID}, '${ADMINID}', '${TOKEN}', 'SQLCLEANER', 1656788062940, 'f')"
# When the script is invoked with "reset" as its first argument, drop the
# three state_compressor_* bookkeeping tables.
[ "${1}" = "reset" ] && {
# reset state_auto_compressor
echo "-- Reset rust-synapse-compress-state"
# NOTE(review): the drops have no "if exists", so a missing table makes this
# command fail.
psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "drop table state_compressor_state; drop table state_compressor_progress; drop table state_compressor_total_progress;"
# purge events in a room - just for reference - ignore this block
# TODO provide a function that purges a room and also its state_groups
#curl --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"delete_local_events\": true, \"purge_up_to_ts\": 1660340843343}" ${DELURL}
# and its status check
#curl --insecure -XGET -H "Authorization: Bearer ${TOKEN}" ${DELURL}
# Remember that if you delete a room, its state groups are still around
# -----------------------------------------------------------------
# Purge a room through the Synapse admin API (DELETE /v2/rooms/<room>) and
# poll its delete_status until it no longer reports "purging".
# Globals: reads TOKEN, MATRIXURL, ROOM; writes D_IDRET, D_ID, D_STATUSRET, D_STATUS.
# Returns: 1 when the API did not hand back a delete_id (its error is printed).
# NOTE(review): ROOM is expected to hold the target room id and D_STATUS to
# start out as "purging"; neither assignment is visible in this excerpt.
purge_room () {
D_IDRET=$(curl --silent --max-time 900 --insecure -XDELETE -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"block\": false, \"purge\": true}" "${MATRIXURL}/_synapse/admin/v2/rooms/${ROOM}")
D_ID=$(printf '%s\n' "${D_IDRET}" | jq -r '.delete_id')
if [ "${D_ID}" = "null" ]; then
printf '%s\n' "${D_IDRET}" | jq -r '.error'
return 1
# Server-supplied values go through %s so that '%' or '\' in them are printed
# literally instead of being interpreted as printf directives.
printf 'Waiting for deletion %s for %s' "${D_ID}" "${ROOM}"
sleep 4
while [ "${D_STATUS}" = "purging" ]; do
D_STATUSRET=$(curl --silent --max-time 900 --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v2/rooms/delete_status/${D_ID}")
# quoted, so the JSON is not word-split or glob-expanded before jq sees it
D_STATUS=$(printf '%s\n' "${D_STATUSRET}" | jq -r '.status')
printf "."
sleep 5
printf '\nPurge finished %s\n' "${D_STATUSRET}"
# Disable ratelimit
# POSTs {"messages_per_second": 0} to ${RATEURL} using the admin token.
echo "--Disabling ratelimit for admin user ${ADMINID}..."
# NOTE(review): ${RATEURL} is unquoted and therefore subject to word splitting
# and globbing.
curl --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"messages_per_second\": 0}" ${RATEURL}
# Purge events older than 14 days from a room through the Synapse admin API
# (POST /v1/purge_history/<room>) and poll purge_history_status until it no
# longer reports "active". We do that here instead of per auto_retention,
# because it allows us to time it.
# Globals: reads TOKEN, MATRIXURL, ROOM; writes TS, P_IDRET, P_ID, P_STATUSRET, P_STATUS.
# Returns: 1 when the API did not hand back a purge_id (its error is printed).
# NOTE(review): ROOM is expected to hold the target room id and P_STATUS to
# start out as "active"; neither assignment is visible in this excerpt.
purge_room_history () {
# ms timestamp of 30 days ago
#TS=$((`date +%s%3N` - 2592000000))
# ms timestamp of 14 days ago (%3N = milliseconds, GNU date)
TS=$(( $(date +%s%3N) - 1209600000 ))
P_IDRET=$(curl --silent --max-time 900 --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"delete_local_events\": true, \"purge_up_to_ts\": ${TS}}" "${MATRIXURL}/_synapse/admin/v1/purge_history/${ROOM}")
P_ID=$(printf '%s\n' "${P_IDRET}" | jq -r '.purge_id')
if [ "${P_ID}" = "null" ]; then
printf '%s\n' "${P_IDRET}" | jq -r '.error'
return 1
# Server-supplied values go through %s so that '%' or '\' in them are printed
# literally instead of being interpreted as printf directives.
printf 'Waiting for purge %s for %s' "${P_ID}" "${ROOM}"
sleep 4
while [ "${P_STATUS}" = "active" ]; do
P_STATUSRET=$(curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/purge_history_status/${P_ID}")
# quoted, so the JSON is not word-split or glob-expanded before jq sees it
P_STATUS=$(printf '%s\n' "${P_STATUSRET}" | jq -r '.status')
printf "."
sleep 5
printf '\nPurge finished with %s\n' "${P_STATUSRET}"
# Remember that if you delete a room, its state groups might still be around
# Collect the room ids of all aliases that start with '#${PREFIX}' followed by
# at least one more character ('_' is a single-character wildcard in LIKE).
ROOMS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select room_id from room_aliases where room_alias like '#${PREFIX}_%'"`
# Walk through every room known to the server (paginated admin room list,
# ordered by joined_members) and, per room:
#   - purge it when it is federated and has no local members,
#   - purge it when it has no members at all,
#   - otherwise, when it has more than 10 members, purge history older than
#     14 days with a probability of roughly 40 %.
# Globals: reads TOKEN, MATRIXURL, HOMESERVER, L, G, R, NC; writes RET, NEXT_BATCH.
# NOTE(review): the initial NEXT_BATCH value is not visible in this excerpt.
check_rooms () {
echo "-- Checking individual rooms"
while [ "${NEXT_BATCH}" != "null" ]; do
RET=$(curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/rooms?order_by=joined_members&from=${NEXT_BATCH}")
# -r keeps the backslash escapes inside the JSON objects intact
printf '%s\n' "$RET" | jq -c '.rooms[]' | while read -r room
amount=$(printf '%s\n' "$room" | jq -r .joined_members)
amount_local=$(printf '%s\n' "$room" | jq -r .joined_local_members)
id=$(printf '%s\n' "$room" | jq -r .room_id)
# everything after the first ':' of the room id is the originating server
server=${id#*:}
# colors stay in the format string (printf expands their \033), data goes through %s
printf "=> ${L}%s = ${G}%s,%s ${R}%s${NC}\n" "${id}" "${amount}" "${amount_local}" "${server}"
if [ "${server}" != "${HOMESERVER}" ] && [ "${amount_local}" = "0" ]; then
echo "Room ${id} is federated, but has no local users in it, purging..."
purge_room "${id}"
elif [ "${amount}" = "0" ]; then
echo "Room ${id} has no members, purging..."
purge_room "${id}"
# numeric comparison; the former `[ ${amount} > 10 ]` was a redirection that
# created a file named "10" and was true for every room
elif [ "${amount}" -gt 10 ]; then
#randomly decide if purge should happen or not
# (first digit of the current nanoseconds is 0-3, i.e. about 4 out of 10 runs)
[ "$(date +%1N)" -lt 4 ] && {
echo "Room ${id} with >10 users, magic decided to delete history of >14d ago..."
purge_room_history "${id}"
NEXT_BATCH=$(printf '%s\n' "$RET" | jq -r '.next_batch')
# check unreferenced state groups for 20 largest rooms
# Runs ${SYNAPSE_UNREFERENCED_STATES} for each of the first 20 rooms of the
# admin room list (ordered by joined_members) and appends every per-room
# result to the file named by ${files}.
# NOTE(review): the caller passes the output path as first argument; the
# assignment of ${files} is not visible in this excerpt.
check_state_groups () {
echo "--- Checking largest room state groups"
# start from a clean slate: remove leftovers of a previous run
[ -f "${files}" ] && rm "${files}"
[ -f "/tmp/sgstmp.txt" ] && rm "/tmp/sgstmp.txt"
RET=`curl --max-time 900 --silent --insecure -XGET -H "Authorization: Bearer ${TOKEN}" "${MATRIXURL}/_synapse/admin/v1/rooms?order_by=joined_members&limit=20"`
# one compact JSON object per room, one room per line
echo "$RET" | jq -c '.rooms[]' | while read room
id=$(echo "$room" | jq -r .room_id)
printf "=> ${L}${id}${NC}\n"
# lowered priority (nice 10); the tool writes its result for this room to a temp file
nice -n 10 ${SYNAPSE_UNREFERENCED_STATES} -p postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -r "${id}" -o "/tmp/sgstmp.txt"
# collect the per-room result, if one was written, and clean up the temp file
[ -f "/tmp/sgstmp.txt" ] && {
cat "/tmp/sgstmp.txt" >> "${files}"
rm "/tmp/sgstmp.txt"
# clear unreferenced state groups
# Collects the unreferenced state groups twice, 10 seconds apart, and only
# continues when both runs produced identical lists. It then stops
# matrix-synapse, deletes the listed groups from state_groups_state,
# state_group_edges and state_groups, reindexes the database and starts
# matrix-synapse again.
clear_state_groups () {
echo "--Clearing unreferenced state groups"
check_state_groups "/tmp/sgs.txt"
sleep 10
check_state_groups "/tmp/sgs2.txt"
# a difference means the state groups changed in between: bail out
diff "/tmp/sgs.txt" "/tmp/sgs2.txt" > /dev/null || {
echo "State groups changed while checking, exiting..."
exit 1
rm "/tmp/sgs2.txt"
# can't run this while synapse is running
systemctl stop matrix-synapse
# the ids in /tmp/sgs.txt are fed via stdin into a temporary table, which
# then drives the three deletes
psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "CREATE TEMPORARY TABLE unreffed(id BIGINT PRIMARY KEY); COPY unreffed FROM stdin WITH (FORMAT 'csv'); DELETE FROM state_groups_state WHERE state_group IN (SELECT id FROM unreffed); DELETE FROM state_group_edges WHERE state_group IN (SELECT id FROM unreffed); DELETE FROM state_groups WHERE id IN (SELECT id FROM unreffed);" < /tmp/sgs.txt
rm "/tmp/sgs.txt"
psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "REINDEX (VERBOSE) DATABASE ${SQLDB}"
systemctl start matrix-synapse
# get all appsocket users from a room that did not send any message in the past 30 days,
# and print a curl config to kick them
# NOTE(review): this excerpt interleaves an older and a newer revision of the
# same statements, and the lines that emit the curl config and maintain
# ${CNT} are not visible here.
get_curl_config () {
# Older revision of the query: users of the public room that sent no event
# within the last 4 days and have no entry in user_ips.
ASUSERS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select user_id from users_in_public_rooms u where room_id = '${room}' and user_id like '@${PREFIX}' and user_id != '${ADMINID}' and not exists ( select from events where room_id = '${room}' and sender = u.user_id and to_timestamp(received_ts/1000) > now() - interval '4 DAYS' ) and not exists ( select from user_ips where user_id = u.user_id )"`
# Newer revision: same query with a 30-day inactivity window.
ASUSERS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select user_id from users_in_public_rooms u where room_id = '${room}' and user_id like '@${PREFIX}' and user_id != '${ADMINID}' and not exists ( select from events where room_id = '${room}' and sender = u.user_id and to_timestamp(received_ts/1000) > now() - interval '30 DAYS' ) and not exists ( select from user_ips where user_id = u.user_id )"`
for user in ${ASUSERS}
if [ ${CNT} -eq 1 ]
for user in ${ASUSERS}; do
if [ ${CNT} -eq 1 ]; then
# "next" separates consecutive transfers in a curl --config file
echo "next"
# the exit status is ${CNT}; callers chain curl behind this function with &&
return ${CNT}
# Iterate over the whitespace-separated room ids held in ${ROOMS}.
for room in ${ROOMS}
echo "--Delete event_push_actions of not logged-in users from romm ${room}..."
# Do stuff for appsocket rooms
# For every room whose alias starts with '#${PREFIX}': delete the
# event_push_actions of bridge users that were not seen within the last two
# days, then kick inactive users via the curl config written by
# get_curl_config.
# NOTE(review): this excerpt interleaves an older and a newer revision of the
# delete and kick statements.
check_api_rooms () {
ROOMS=`psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "select room_id from room_aliases where room_alias like '#${PREFIX}_%'"`
for room in ${ROOMS}
echo "=> Delete event_push_actions of not logged-in users from romm ${room} and kick out inactive users..."
# Clean event_push_actions of not-logged-in application service users
# see
# This command can be really slow, if it takes too long, remove the "not exists..." part and it won't care about if logged in or not
time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from event_push_actions u where room_id = '${room}' and user_id like '@${PREFIX}' and not exists ( select from user_ips where user_id = u.user_id and to_timestamp(last_seen/1000) > now() - interval '2 days' )"
echo "--Kick out inactive users from room ${room}..."
# curl only runs when get_curl_config returned 0; up to 10 requests in parallel
get_curl_config "${room}" > /tmp/curlkick.tmp && curl --parallel --parallel-immediate --parallel-max 10 --config /tmp/curlkick.tmp && echo ""
psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from event_push_actions u where room_id = '${room}' and user_id like '@${PREFIX}' and not exists ( select from user_ips where user_id = u.user_id and to_timestamp(last_seen/1000) > now() - interval '2 days' )"
get_curl_config "${room}" > /tmp/curlkick.tmp && curl --silent --parallel --parallel-immediate --parallel-max 10 --config /tmp/curlkick.tmp && echo ""
rm /tmp/curlkick.tmp
# Top-level maintenance sequence: prune cache_invalidation_stream_by_instance,
# run the state compressor, VACUUM FULL the database and print its size.
echo "--Clean up cache_invalidation_stream_by_instance"
# see
# NOTE(review): the ">" comparison matches rows from the LAST month, i.e. the
# newest entries are deleted and the old ones kept - presumably "<" was meant.
time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "delete from cache_invalidation_stream_by_instance where to_timestamp(invalidation_ts/1000) > now() - interval '1 months';"
echo "--Compress states..."
${SYNAPSE_COMPRESSOR_PATH} -p postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c 500 -n 100
echo "--Vaccum..."
time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "VACUUM FULL VERBOSE"
echo "--DONE. Current database size is..."
# NOTE(review): the database name is hard-coded as 'synapse' here although
# every connection uses ${SQLDB}.
time psql -t postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "SELECT pg_size_pretty( pg_database_size( 'synapse' ) )"
# Disable the rate limit for the admin user by POSTing
# {"messages_per_second": 0} to ${RATEURL}.
# Globals: reads ADMINID, TOKEN, RATEURL.
disable_ratelimit () {
echo "--Disabling ratelimit for admin user ${ADMINID}..."
# URL quoted so it is passed to curl as one argument, without word splitting or globbing
curl --silent --insecure -XPOST -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" -d "{\"messages_per_second\": 0}" "${RATEURL}"
# Under bash, switch off history expansion ('!') before running the query
# below, which contains the '!~' operator.
[ -n "${BASH_VERSION}" ] && set +H
# List the 20 largest relations (indexes and pg_toast/system schemas excluded)
# together with their total on-disk size.
psql postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB} -c "SELECT nspname || '.' || relname AS \"relation\", pg_size_pretty(pg_total_relation_size(c.oid)) AS \"total_size\" FROM pg_class c LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) WHERE nspname NOT IN ('pg_catalog', 'information_schema') AND c.relkind <> 'i' AND nspname !~ '^pg_toast' ORDER BY pg_total_relation_size(c.oid) DESC LIMIT 20;"
# Reset the state compressor by dropping its three bookkeeping tables.
# Globals: reads SQLUSER, SQLPASSWD, SQLDB.
reset_state_compressor () {
echo "--Reset rust-synapse-compress-state"
# "if exists" lets the reset go through even when some of the tables are
# already gone; a plain "drop table" would fail on the first missing one and
# take the remaining drops down with it.
psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "drop table if exists state_compressor_state; drop table if exists state_compressor_progress; drop table if exists state_compressor_total_progress;"
# Run the synapse_auto_compressor binary against the database at lowered
# priority (nice 10).
# Globals: reads SYNAPSE_COMPRESSOR_PATH, SQLUSER, SQLPASSWD, SQLDB.
compress_state () {
echo "--Compress states..."
# binary path and connection URL are quoted so that neither is word-split or globbed
nice -n 10 "${SYNAPSE_COMPRESSOR_PATH}" -p "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c 500 -n 100
# Prune old rows from cache_invalidation_stream_by_instance, then run
# VACUUM FULL on the database.
# Globals: reads SQLUSER, SQLPASSWD, SQLDB.
clean_db () {
echo "--Clean up cache_invalidation_stream_by_instance"
# see
# Delete invalidations OLDER than one month ("<"). The former ">" matched the
# most recent month instead, removing fresh entries and keeping the stale ones.
time psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "delete from cache_invalidation_stream_by_instance where to_timestamp(invalidation_ts/1000) < now() - interval '1 months';"
echo "--Vaccum..."
time psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "VACUUM FULL"
# Print the total size of the database and the 20 largest relations in it.
# Globals: reads SQLUSER, SQLPASSWD, SQLDB.
print_stats () {
echo "--DONE. Current database size is..."
# size of the database we are connected to (was hard-coded as 'synapse')
time psql -t "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "SELECT pg_size_pretty( pg_database_size( '${SQLDB}' ) )"
# under bash, switch off history expansion ('!') before the query containing '!~'
[ -n "${BASH_VERSION}" ] && set +H
# largest relations first; indexes and pg_toast/system schemas are excluded
psql "postgresql://${SQLUSER}:${SQLPASSWD}@localhost/${SQLDB}" -c "SELECT nspname || '.' || relname AS \"relation\", pg_size_pretty(pg_total_relation_size(c.oid)) AS \"total_size\" FROM pg_class c LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) WHERE nspname NOT IN ('pg_catalog', 'information_schema') AND c.relkind <> 'i' AND nspname !~ '^pg_toast' ORDER BY pg_total_relation_size(c.oid) DESC LIMIT 20;"
# Single-instance guard: when ${PIDFILE} exists and the process id stored in
# it is still alive, refuse to start a second run.
[ -f "${PIDFILE}" ] && ps -p `cat "${PIDFILE}"` > /dev/null && {
echo " already running, exiting."
exit 1
# record our own process id for the next invocation's check
echo $$ > "${PIDFILE}"
# "clean_states" mode
# NOTE(review): the commands of this branch are not visible in this excerpt;
# confirm that ${PIDFILE} is removed before the `exit 0`.
[ "${1}" = "clean_states" ] && {
exit 0
# "reset" mode: drop the state compressor's bookkeeping tables first
[ "${1}" = "reset" ] && reset_state_compressor
# normal end of run: release the pidfile
rm "${PIDFILE}"