Skip to content

Commit bbe4c8d

Browse files
Switch to Victoria Metrics
Added auto-restart to docker-compose.yml
1 parent 4653e65 commit bbe4c8d

File tree

3 files changed

+112
-60
lines changed

3 files changed

+112
-60
lines changed

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Built for senior DBAs, SREs, and AI systems who need rapid root cause analysis a
1919
- **Dual-purpose architecture**: Built for both human experts and AI systems requiring structured performance data
2020
- **Comprehensive query analysis**: Complete `pg_stat_statements` metrics with historical trends and plan variations
2121
- **Active Session History**: Postgres's answer to Oracle ASH and AWS RDS Performance Insights
22-
- **Hybrid storage**: Prometheus for metrics, Postgres for query texts — best of both worlds
22+
- **Hybrid storage**: VictoriaMetrics (Prometheus-compatible) for metrics, Postgres for query texts — best of both worlds
2323

2424
> 📖 **Read more**: [postgres_ai monitoring v0.7 announcement](https://postgres.ai/blog/20250722-postgres-ai-v0-7-expert-level-postgresql-monitoring) - detailed technical overview and architecture decisions.
2525
@@ -48,7 +48,7 @@ Experience the full monitoring solution: **https://demo.postgres.ai** (login: `d
4848
## 🏗️ Architecture
4949

5050
- **Collection**: pgwatch v3 (by Cybertec) for metrics gathering
51-
- **Storage**: Prometheus for time-series data + Postgres for query texts
51+
- **Storage**: VictoriaMetrics for time-series data + Postgres for query texts
5252
- **Visualization**: Grafana with expert-designed dashboards
5353
- **Analysis**: Structured data output for AI system integration
5454

@@ -71,7 +71,7 @@ This monitoring solution exposes several ports that **MUST** be properly firewal
7171
- **Port 3000** (Grafana) - Contains sensitive database metrics and dashboards
7272
- **Port 58080** (PGWatch Postgres) - Database monitoring interface
7373
- **Port 58089** (PGWatch Prometheus) - Database monitoring interface
74-
- **Port 59090** (Prometheus) - Metrics storage and queries
74+
- **Port 59090** (VictoriaMetrics) - Metrics storage and queries
7575
- **Port 59091** (PGWatch Prometheus endpoint) - Metrics collection
7676
- **Port 55000** (Flask API) - Backend API service
7777
- **Port 55432** (Demo DB) - When using `--demo` option
@@ -98,6 +98,8 @@ grant pg_monitor to postgres_ai_mon;
9898
grant select on pg_stat_statements to postgres_ai_mon;
9999
grant select on pg_stat_database to postgres_ai_mon;
100100
grant select on pg_stat_user_tables to postgres_ai_mon;
101+
grant select on pg_stat_user_indexes to postgres_ai_mon;
102+
grant select on pg_index to postgres_ai_mon;
101103

102104
-- Create a public view for pg_statistic access (required for bloat metrics on user schemas)
103105
create view public.pg_statistic as
@@ -195,7 +197,7 @@ After running quickstart:
195197
Technical URLs (for advanced users):
196198
- **Demo DB**: postgresql://postgres:postgres@localhost:55432/target_database
197199
- **Monitoring**: http://localhost:58080 (PGWatch)
198-
- **Metrics**: http://localhost:59090 (Prometheus)
200+
- **Metrics**: http://localhost:59090 (VictoriaMetrics)
199201

200202
## 📖 Help
201203

config/pgwatch-prometheus/metrics.yml

Lines changed: 96 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -854,34 +854,32 @@ metrics:
854854
pg_stat_statements:
855855
sqls:
856856
11: |
857-
WITH ranked_statements as (
857+
WITH aggregated_statements as (
858858
select
859-
pg_get_userbyid(userid) as user,
860859
pg_database.datname,
861-
pg_stat_statements.queryid ,
862-
pg_stat_statements.plans as plans_total,
863-
pg_stat_statements.calls,
864-
pg_stat_statements.total_exec_time as exec_time_total,
865-
pg_stat_statements.total_plan_time as plan_time_total,
866-
pg_stat_statements.rows,
867-
(current_setting('block_size')::int * pg_stat_statements.shared_blks_hit) as shared_bytes_hit_total,
868-
(current_setting('block_size')::int * pg_stat_statements.shared_blks_read) as shared_bytes_read_total,
869-
(current_setting('block_size')::int * pg_stat_statements.shared_blks_dirtied) as shared_bytes_dirtied_total,
870-
(current_setting('block_size')::int * pg_stat_statements.shared_blks_written) as shared_bytes_written_total,
871-
pg_stat_statements.blk_read_time as block_read_total,
872-
pg_stat_statements.blk_write_time as block_write_total,
873-
pg_stat_statements.wal_records,
874-
pg_stat_statements.wal_fpi,
875-
pg_stat_statements.wal_bytes,
876-
(current_setting('block_size')::int * pg_stat_statements.temp_blks_read) as temp_bytes_read,
877-
(current_setting('block_size')::int * pg_stat_statements.temp_blks_written) as temp_bytes_written,
878-
row_number() over (order by total_exec_time desc) as rn
860+
pg_stat_statements.queryid,
861+
sum(pg_stat_statements.plans) as plans_total,
862+
sum(pg_stat_statements.calls) as calls,
863+
sum(pg_stat_statements.total_exec_time) as exec_time_total,
864+
sum(pg_stat_statements.total_plan_time) as plan_time_total,
865+
sum(pg_stat_statements.rows) as rows,
866+
sum(current_setting('block_size')::int * pg_stat_statements.shared_blks_hit) as shared_bytes_hit_total,
867+
sum(current_setting('block_size')::int * pg_stat_statements.shared_blks_read) as shared_bytes_read_total,
868+
sum(current_setting('block_size')::int * pg_stat_statements.shared_blks_dirtied) as shared_bytes_dirtied_total,
869+
sum(current_setting('block_size')::int * pg_stat_statements.shared_blks_written) as shared_bytes_written_total,
870+
sum(pg_stat_statements.blk_read_time) as block_read_total,
871+
sum(pg_stat_statements.blk_write_time) as block_write_total,
872+
sum(pg_stat_statements.wal_records) as wal_records,
873+
sum(pg_stat_statements.wal_fpi) as wal_fpi,
874+
sum(pg_stat_statements.wal_bytes) as wal_bytes,
875+
sum(current_setting('block_size')::int * pg_stat_statements.temp_blks_read) as temp_bytes_read,
876+
sum(current_setting('block_size')::int * pg_stat_statements.temp_blks_written) as temp_bytes_written
879877
from pg_stat_statements
880878
join pg_database
881879
on pg_database.oid = pg_stat_statements.dbid
880+
group by pg_database.datname, pg_stat_statements.queryid
882881
)
883882
select
884-
ranked_statements.user as tag_user,
885883
datname as tag_datname,
886884
queryid as tag_queryid,
887885
calls::int8 as calls,
@@ -900,36 +898,34 @@ metrics:
900898
wal_bytes::int8 as wal_bytes,
901899
temp_bytes_read::int8 as temp_bytes_read,
902900
temp_bytes_written::int8 as temp_bytes_written
903-
from ranked_statements
901+
from aggregated_statements
904902
17: |
905-
WITH ranked_statements as (
903+
WITH aggregated_statements as (
906904
select
907-
pg_get_userbyid(userid) as user,
908905
pg_database.datname,
909-
pg_stat_statements.queryid ,
910-
pg_stat_statements.plans as plans_total,
911-
pg_stat_statements.calls,
912-
pg_stat_statements.total_exec_time as exec_time_total,
913-
pg_stat_statements.total_plan_time as plan_time_total,
914-
pg_stat_statements.rows,
915-
(current_setting('block_size')::int * pg_stat_statements.shared_blks_hit) as shared_bytes_hit_total,
916-
(current_setting('block_size')::int * pg_stat_statements.shared_blks_read) as shared_bytes_read_total,
917-
(current_setting('block_size')::int * pg_stat_statements.shared_blks_dirtied) as shared_bytes_dirtied_total,
918-
(current_setting('block_size')::int * pg_stat_statements.shared_blks_written) as shared_bytes_written_total,
919-
pg_stat_statements.shared_blk_read_time as block_read_total,
920-
pg_stat_statements.shared_blk_write_time as block_write_total,
921-
pg_stat_statements.wal_records,
922-
pg_stat_statements.wal_fpi,
923-
pg_stat_statements.wal_bytes,
924-
(current_setting('block_size')::int * pg_stat_statements.temp_blks_read) as temp_bytes_read,
925-
(current_setting('block_size')::int * pg_stat_statements.temp_blks_written) as temp_bytes_written,
926-
row_number() over (order by total_exec_time desc) as rn
906+
pg_stat_statements.queryid,
907+
sum(pg_stat_statements.plans) as plans_total,
908+
sum(pg_stat_statements.calls) as calls,
909+
sum(pg_stat_statements.total_exec_time) as exec_time_total,
910+
sum(pg_stat_statements.total_plan_time) as plan_time_total,
911+
sum(pg_stat_statements.rows) as rows,
912+
sum(current_setting('block_size')::int * pg_stat_statements.shared_blks_hit) as shared_bytes_hit_total,
913+
sum(current_setting('block_size')::int * pg_stat_statements.shared_blks_read) as shared_bytes_read_total,
914+
sum(current_setting('block_size')::int * pg_stat_statements.shared_blks_dirtied) as shared_bytes_dirtied_total,
915+
sum(current_setting('block_size')::int * pg_stat_statements.shared_blks_written) as shared_bytes_written_total,
916+
sum(pg_stat_statements.shared_blk_read_time) as block_read_total,
917+
sum(pg_stat_statements.shared_blk_write_time) as block_write_total,
918+
sum(pg_stat_statements.wal_records) as wal_records,
919+
sum(pg_stat_statements.wal_fpi) as wal_fpi,
920+
sum(pg_stat_statements.wal_bytes) as wal_bytes,
921+
sum(current_setting('block_size')::int * pg_stat_statements.temp_blks_read) as temp_bytes_read,
922+
sum(current_setting('block_size')::int * pg_stat_statements.temp_blks_written) as temp_bytes_written
927923
from pg_stat_statements
928924
join pg_database
929925
on pg_database.oid = pg_stat_statements.dbid
926+
group by pg_database.datname, pg_stat_statements.queryid
930927
)
931928
select
932-
ranked_statements.user as tag_user,
933929
datname as tag_datname,
934930
queryid as tag_queryid,
935931
calls::int8 as calls,
@@ -948,7 +944,7 @@ metrics:
948944
wal_bytes::int8 as wal_bytes,
949945
temp_bytes_read::int8 as temp_bytes_read,
950946
temp_bytes_written::int8 as temp_bytes_written
951-
from ranked_statements
947+
from aggregated_statements
952948
gauges:
953949
- calls
954950
- plans_total
@@ -1650,7 +1646,8 @@ metrics:
16501646
pg_relation_size(i.indexrelid) as index_bytes,
16511647
ci.relpages,
16521648
(case when a.amname = 'btree' then true else false end) as idx_is_btree,
1653-
array_to_string(i.indclass, ', ') as opclasses
1649+
array_to_string(i.indclass, ', ') as opclasses,
1650+
si.stats_reset
16541651
from pg_index i
16551652
join pg_class ci on ci.oid = i.indexrelid and ci.relkind = 'i'
16561653
join pg_class cr on cr.oid = i.indrelid and cr.relkind = 'r'
@@ -1679,6 +1676,7 @@ metrics:
16791676
i.relpages,
16801677
idx_is_btree,
16811678
i.opclasses,
1679+
i.stats_reset,
16821680
(
16831681
select count(1)
16841682
from fk_indexes fi
@@ -1705,7 +1703,8 @@ metrics:
17051703
relpages,
17061704
idx_is_btree,
17071705
opclasses AS tag_opclasses,
1708-
supports_fk
1706+
supports_fk,
1707+
extract(epoch from stats_reset) as stats_reset_epoch
17091708
from index_ratios
17101709
where
17111710
idx_scan = 0
@@ -1858,6 +1857,54 @@ metrics:
18581857
gauges:
18591858
- '*'
18601859

1860+
stats_reset:
1861+
description: >
1862+
This metric tracks when statistics were last reset for indexes and tables.
1863+
It provides visibility into the freshness of statistics data, which is essential for understanding
1864+
the reliability of index and table usage metrics. A recent reset time indicates that usage statistics
1865+
may not reflect long-term patterns.
1866+
sqls:
1867+
11: |
1868+
with index_stats as (
1869+
select
1870+
schemaname as schema_name,
1871+
indexrelname as index_name,
1872+
relname as table_name,
1873+
'index' as stat_type,
1874+
stats_reset,
1875+
extract(epoch from stats_reset) as stats_reset_epoch,
1876+
extract(epoch from now() - stats_reset) as seconds_since_reset
1877+
from pg_stat_user_indexes
1878+
where stats_reset is not null
1879+
), table_stats as (
1880+
select
1881+
schemaname as schema_name,
1882+
null::text as index_name,
1883+
relname as table_name,
1884+
'table' as stat_type,
1885+
stats_reset,
1886+
extract(epoch from stats_reset) as stats_reset_epoch,
1887+
extract(epoch from now() - stats_reset) as seconds_since_reset
1888+
from pg_stat_user_tables
1889+
where stats_reset is not null
1890+
), combined_stats as (
1891+
select * from index_stats
1892+
union all
1893+
select * from table_stats
1894+
)
1895+
select
1896+
schema_name as tag_schema_name,
1897+
table_name as tag_table_name,
1898+
coalesce(index_name, 'table') as tag_index_name,
1899+
stat_type as tag_stat_type,
1900+
stats_reset_epoch,
1901+
seconds_since_reset
1902+
from combined_stats
1903+
order by stats_reset desc;
1904+
gauges:
1905+
- 'stats_reset_epoch'
1906+
- 'seconds_since_reset'
1907+
18611908
archive_lag:
18621909
description: >
18631910
This metric measures the lag in WAL archive processing.
@@ -2091,6 +2138,8 @@ metrics:
20912138
tidx_blks_hit
20922139
from
20932140
pg_statio_all_tables
2141+
order by pg_total_relation_size((schemaname || '.' || relname)::regclass) desc
2142+
limit 5000;
20942143
gauges:
20952144
- '*'
20962145

@@ -2133,6 +2182,7 @@ presets:
21332182
redundant_indexes: 10800
21342183
unused_indexes: 7200
21352184
rarely_used_indexes: 10800
2185+
stats_reset: 3600
21362186
archive_lag: 15
21372187
pg_vacuum_progress: 30
21382188
pg_index_pilot:

docker-compose.yml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,22 +54,22 @@ services:
5454
- sink_postgres_data:/var/lib/postgresql/data
5555
- ./config/sink-postgres/init.sql:/docker-entrypoint-initdb.d/init.sql
5656

57-
# Prometheus Sink - Storage for metrics in Prometheus format
57+
# VictoriaMetrics Sink - Storage for metrics in Prometheus format
5858
sink-prometheus:
59-
image: prom/prometheus:v3.4.2
59+
image: victoriametrics/victoria-metrics:v1.105.0
6060
container_name: sink-prometheus
6161
ports:
6262
- "${BIND_HOST:-}59090:9090"
6363
volumes:
64+
- victoria_metrics_data:/victoria-metrics-data
6465
- ./config/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
65-
- prometheus_data:/prometheus
6666
command:
67-
- "--config.file=/etc/prometheus/prometheus.yml"
68-
- "--storage.tsdb.path=/prometheus"
69-
- "--web.console.libraries=/etc/prometheus/console_libraries"
70-
- "--web.console.templates=/etc/prometheus/consoles"
71-
- "--storage.tsdb.retention.time=200h"
72-
- "--web.enable-lifecycle"
67+
- "-storageDataPath=/victoria-metrics-data"
68+
- "-retentionPeriod=200h"
69+
- "-httpListenAddr=:9090"
70+
- "-promscrape.config=/etc/prometheus/prometheus.yml"
71+
- "-promscrape.config.strictParse=false"
72+
- "-promscrape.maxScrapeSize=128000000"
7373

7474
# PGWatch Instance 1 - Monitoring service (Postgres sink)
7575
pgwatch-postgres:
@@ -236,5 +236,5 @@ services:
236236
volumes:
237237
target_db_data:
238238
sink_postgres_data:
239-
prometheus_data:
239+
victoria_metrics_data:
240240
grafana_data:

0 commit comments

Comments
 (0)