Compare commits

...

19 commits

Author SHA1 Message Date
Jacob Taylor
1b4890fd26 exponential backoff is now just bees. did you want bees? no? well you have them now. congrats 2025-08-14 20:23:40 -07:00
Jacob Taylor
a81822165d fix too many infos 2025-08-14 20:23:40 -07:00
Jacob Taylor
b2050836a9 more funny settings (part 3 of 12) 2025-08-14 20:23:40 -07:00
Jacob Taylor
dd504aba4e sender_workers scaling. this time, with feeling! 2025-08-14 20:23:40 -07:00
Jacob Taylor
5cf175c392 vehicle loan documentation now available at window 7 2025-08-14 20:23:40 -07:00
Jacob Taylor
ff2e41f81e lock the getter instead ??? c/o M 2025-08-14 20:23:40 -07:00
Jacob Taylor
54008dcd5e make fetching key room events less smart 2025-08-14 20:23:40 -07:00
Jacob Taylor
564d3f179a change rocksdb stats level to 3
scale rocksdb background jobs and subcompactions

change rocksdb default error level to info from error

delete unused num_threads function

fix warns from cargo
2025-08-14 20:23:40 -07:00
nexy7574
6b5e4fab28 log which room struggled to get mainline depth 2025-08-14 20:23:40 -07:00
nexy7574
8046f69ed4 more logs 2025-08-14 20:23:40 -07:00
nexy7574
82f4521e65 Fix room ID check 2025-08-14 20:23:40 -07:00
nexy7574
113ab10e54 Kick up a fuss when m.room.create is unfindable 2025-08-14 20:23:40 -07:00
nexy7574
3728fe572f Note about ruma#2064 in TODO 2025-08-14 20:23:40 -07:00
nexy7574
9fea7c4899 fix an auth rule not applying correctly 2025-08-14 20:23:40 -07:00
Jacob Taylor
6adead0da7 upgrade some settings to enable 5g in continuwuity
enable converged 6g at the edge in continuwuity

better stateinfo_cache_capacity default

better roomid_spacehierarchy_cache_capacity

make sender workers default better and clamp value to core count

update sender workers documentation

add more parallelism_scaled and make them public

update 1 document
2025-08-14 20:23:40 -07:00
Jacob Taylor
53bf4ac512 bump the number of allowed immutable memtables by 1, to allow for greater flood protection
this should probably not be applied if you have rocksdb_atomic_flush = false (the default)
2025-08-14 20:23:40 -07:00
Jacob Taylor
69e47b7fb7 probably incorrectly delete support for non-standardized matrix srv record 2025-08-14 20:23:40 -07:00
nexy7574
255aa44ecc
fix(fed): Alter log levels to be less noisy 2025-08-15 04:20:03 +01:00
nexy7574
04130dcdd8
fix(fed): Improve transaction flushing 2025-08-15 04:11:54 +01:00
18 changed files with 139 additions and 112 deletions

View file

@ -1066,7 +1066,7 @@
# 3 to 5 = Statistics with possible performance impact.
# 6 = All statistics.
#
#rocksdb_stats_level = 1
#rocksdb_stats_level = 3
# This is a password that can be configured that will let you login to the
# server bot account (currently `@conduit`) for emergency troubleshooting
@ -1680,11 +1680,9 @@
#stream_amplification = 1024
# Number of sender task workers; determines sender parallelism. By default
# the value is determined internally, matching the
# number of tokio worker-threads or number of cores, etc. Override by
# setting a non-zero value.
# number of CPU cores. Override by setting a different value.
#
#sender_workers = 0
#sender_workers = 4
# Enables listener sockets; can be set to false to disable listening. This
# option is intended for developer/diagnostic purposes only.

View file

@ -1235,7 +1235,7 @@ pub struct Config {
/// 3 to 5 = Statistics with possible performance impact.
/// 6 = All statistics.
///
/// default: 1
/// default: 3
#[serde(default = "default_rocksdb_stats_level")]
pub rocksdb_stats_level: u8,
@ -1917,12 +1917,10 @@ pub struct Config {
pub stream_amplification: usize,
/// Number of sender task workers; determines sender parallelism. By default
/// the value is determined internally, matching the
/// number of tokio worker-threads or number of cores, etc. Override by
/// setting a non-zero value.
/// core count. Override by setting a different value.
///
/// default: 0
#[serde(default)]
/// default: core count
#[serde(default = "default_sender_workers")]
pub sender_workers: usize,
/// Enables listener sockets; can be set to false to disable listening. This
@ -2153,45 +2151,48 @@ fn default_database_backups_to_keep() -> i16 { 1 }
fn default_db_write_buffer_capacity_mb() -> f64 { 48.0 + parallelism_scaled_f64(4.0) }
fn default_db_cache_capacity_mb() -> f64 { 128.0 + parallelism_scaled_f64(64.0) }
fn default_db_cache_capacity_mb() -> f64 { 512.0 + parallelism_scaled_f64(512.0) }
fn default_pdu_cache_capacity() -> u32 { parallelism_scaled_u32(10_000).saturating_add(100_000) }
fn default_pdu_cache_capacity() -> u32 { parallelism_scaled_u32(50_000).saturating_add(100_000) }
fn default_cache_capacity_modifier() -> f64 { 1.0 }
fn default_auth_chain_cache_capacity() -> u32 {
parallelism_scaled_u32(10_000).saturating_add(100_000)
}
fn default_shorteventid_cache_capacity() -> u32 {
parallelism_scaled_u32(50_000).saturating_add(100_000)
}
fn default_shorteventid_cache_capacity() -> u32 {
parallelism_scaled_u32(100_000).saturating_add(100_000)
}
fn default_eventidshort_cache_capacity() -> u32 {
parallelism_scaled_u32(25_000).saturating_add(100_000)
parallelism_scaled_u32(50_000).saturating_add(100_000)
}
fn default_eventid_pdu_cache_capacity() -> u32 {
parallelism_scaled_u32(25_000).saturating_add(100_000)
parallelism_scaled_u32(50_000).saturating_add(100_000)
}
fn default_shortstatekey_cache_capacity() -> u32 {
parallelism_scaled_u32(10_000).saturating_add(100_000)
parallelism_scaled_u32(50_000).saturating_add(100_000)
}
fn default_statekeyshort_cache_capacity() -> u32 {
parallelism_scaled_u32(10_000).saturating_add(100_000)
parallelism_scaled_u32(50_000).saturating_add(100_000)
}
fn default_servernameevent_data_cache_capacity() -> u32 {
parallelism_scaled_u32(100_000).saturating_add(500_000)
parallelism_scaled_u32(100_000).saturating_add(100_000)
}
fn default_stateinfo_cache_capacity() -> u32 { parallelism_scaled_u32(100) }
fn default_stateinfo_cache_capacity() -> u32 {
parallelism_scaled_u32(500).clamp(100, 12000)
}
fn default_roomid_spacehierarchy_cache_capacity() -> u32 { parallelism_scaled_u32(1000) }
/// Default for `roomid_spacehierarchy_cache_capacity`: 500 scaled by
/// `parallelism_scaled_u32`, then clamped to the range 100..=12000.
fn default_roomid_spacehierarchy_cache_capacity() -> u32 {
    parallelism_scaled_u32(500).clamp(100, 12000)
}
fn default_dns_cache_entries() -> u32 { 32768 }
fn default_dns_cache_entries() -> u32 { 327680 }
fn default_dns_min_ttl() -> u64 { 60 * 180 }
@ -2297,7 +2298,7 @@ fn default_typing_client_timeout_max_s() -> u64 { 45 }
fn default_rocksdb_recovery_mode() -> u8 { 1 }
fn default_rocksdb_log_level() -> String { "error".to_owned() }
fn default_rocksdb_log_level() -> String { "info".to_owned() }
fn default_rocksdb_log_time_to_roll() -> usize { 0 }
@ -2329,7 +2330,7 @@ fn default_rocksdb_compression_level() -> i32 { 32767 }
#[allow(clippy::doc_markdown)]
fn default_rocksdb_bottommost_compression_level() -> i32 { 32767 }
fn default_rocksdb_stats_level() -> u8 { 1 }
fn default_rocksdb_stats_level() -> u8 { 3 }
// I know, it's a great name
#[must_use]
@ -2384,14 +2385,13 @@ fn default_admin_log_capture() -> String {
fn default_admin_room_tag() -> String { "m.server_notice".to_owned() }
#[allow(clippy::as_conversions, clippy::cast_precision_loss)]
fn parallelism_scaled_f64(val: f64) -> f64 { val * (sys::available_parallelism() as f64) }
pub fn parallelism_scaled_f64(val: f64) -> f64 { val * (sys::available_parallelism() as f64) }
fn parallelism_scaled_u32(val: u32) -> u32 {
let val = val.try_into().expect("failed to cast u32 to usize");
parallelism_scaled(val).try_into().unwrap_or(u32::MAX)
}
/// Multiplies `val` by the value reported by `sys::available_parallelism()`,
/// saturating at `u32::MAX` instead of overflowing.
///
/// The parallelism figure is converted with a checked conversion (clamped to
/// `u32::MAX`) rather than a truncating `as` cast.
pub fn parallelism_scaled_u32(val: u32) -> u32 {
    let parallelism = u32::try_from(sys::available_parallelism()).unwrap_or(u32::MAX);
    val.saturating_mul(parallelism)
}
fn parallelism_scaled(val: usize) -> usize { val.saturating_mul(sys::available_parallelism()) }
/// Multiplies `val` by the value reported by `sys::available_parallelism()`,
/// saturating at the `i32` bounds instead of overflowing.
///
/// The parallelism figure is converted with a checked conversion (clamped to
/// `i32::MAX`) rather than an `as` cast, which could wrap to a negative value.
pub fn parallelism_scaled_i32(val: i32) -> i32 {
    let parallelism = i32::try_from(sys::available_parallelism()).unwrap_or(i32::MAX);
    val.saturating_mul(parallelism)
}
pub fn parallelism_scaled(val: usize) -> usize { val.saturating_mul(sys::available_parallelism()) }
fn default_trusted_server_batch_size() -> usize { 256 }
@ -2411,6 +2411,8 @@ fn default_stream_width_scale() -> f32 { 1.0 }
fn default_stream_amplification() -> usize { 1024 }
fn default_sender_workers() -> usize { parallelism_scaled(1) }
fn default_client_receive_timeout() -> u64 { 75 }
fn default_client_request_timeout() -> u64 { 180 }

View file

@ -13,6 +13,7 @@ use ruma::{
power_levels::RoomPowerLevelsEventContent,
third_party_invite::RoomThirdPartyInviteEventContent,
},
EventId,
int,
serde::{Base64, Raw},
};
@ -21,7 +22,6 @@ use serde::{
de::{Error as _, IgnoredAny},
};
use serde_json::{from_str as from_json_str, value::RawValue as RawJsonValue};
use super::{
Error, Event, Result, StateEventType, StateKey, TimelineEventType,
power_levels::{
@ -149,8 +149,8 @@ where
for<'a> &'a E: Event + Send,
{
debug!(
event_id = %incoming_event.event_id(),
event_type = ?incoming_event.event_type(),
event_id = format!("{}", incoming_event.event_id()),
event_type = format!("{}", incoming_event.event_type()),
"auth_check beginning"
);
@ -219,7 +219,7 @@ where
/*
// TODO: In the past this code was commented as it caused problems with Synapse. This is no
// longer the case. This needs to be implemented.
// See also: https://github.com/ruma/ruma/pull/2064
// See also: https://github.com/ruma/ruma/pull/2064
//
// 2. Reject if auth_events
// a. auth_events cannot have duplicate keys since it's a BTree
@ -242,20 +242,44 @@ where
}
*/
let (room_create_event, power_levels_event, sender_member_event) = join3(
fetch_state(&StateEventType::RoomCreate, ""),
fetch_state(&StateEventType::RoomPowerLevels, ""),
fetch_state(&StateEventType::RoomMember, sender.as_str()),
)
.await;
// let (room_create_event, power_levels_event, sender_member_event) = join3(
// fetch_state(&StateEventType::RoomCreate, ""),
// fetch_state(&StateEventType::RoomPowerLevels, ""),
// fetch_state(&StateEventType::RoomMember, sender.as_str()),
// )
// .await;
let room_create_event = fetch_state(&StateEventType::RoomCreate, "").await;
let power_levels_event = fetch_state(&StateEventType::RoomPowerLevels, "").await;
let sender_member_event = fetch_state(&StateEventType::RoomMember, sender.as_str()).await;
let room_create_event = match room_create_event {
| None => {
warn!("no m.room.create event in auth chain");
error!(
create_event = room_create_event.as_ref().map(Event::event_id).unwrap_or(<&EventId>::try_from("$unknown").unwrap()).as_str(),
power_levels = power_levels_event.as_ref().map(Event::event_id).unwrap_or(<&EventId>::try_from("$unknown").unwrap()).as_str(),
member_event = sender_member_event.as_ref().map(Event::event_id).unwrap_or(<&EventId>::try_from("$unknown").unwrap()).as_str(),
"no m.room.create event found for {} ({})!",
incoming_event.event_id().as_str(),
incoming_event.room_id().as_str()
);
return Ok(false);
},
| Some(e) => e,
};
// just re-check 1.2 to work around a bug
let Some(room_id_server_name) = incoming_event.room_id().server_name() else {
warn!("room ID has no servername");
return Ok(false);
};
if room_id_server_name != room_create_event.sender().server_name() {
warn!(
"servername of room ID origin ({}) does not match servername of m.room.create sender ({})",
room_id_server_name,
room_create_event.sender().server_name());
return Ok(false);
}
if incoming_event.room_id() != room_create_event.room_id() {
warn!("room_id of incoming event does not match room_id of m.room.create event");

View file

@ -733,8 +733,12 @@ where
Fut: Future<Output = Option<E>> + Send,
E: Event + Send + Sync,
{
let mut room_id = None;
while let Some(sort_ev) = event {
debug!(event_id = sort_ev.event_id().as_str(), "mainline");
if room_id.is_none() {
room_id = Some(sort_ev.room_id().to_owned());
}
let id = sort_ev.event_id();
if let Some(depth) = mainline_map.get(id) {
@ -753,7 +757,7 @@ where
}
}
}
// Did not find a power level event so we default to zero
warn!("could not find a power event in the mainline map for {room_id:?}, defaulting to zero depth");
Ok(0)
}

View file

@ -29,7 +29,7 @@ fn descriptor_cf_options(
set_table_options(&mut opts, &desc, cache)?;
opts.set_min_write_buffer_number(1);
opts.set_max_write_buffer_number(2);
opts.set_max_write_buffer_number(3);
opts.set_write_buffer_size(desc.write_size);
opts.set_target_file_size_base(desc.file_size);

View file

@ -1,8 +1,6 @@
use std::{cmp, convert::TryFrom};
use conduwuit::{Config, Result, utils};
use conduwuit::{Config, Result};
use rocksdb::{Cache, DBRecoveryMode, Env, LogLevel, Options, statistics::StatsLevel};
use conduwuit::config::{parallelism_scaled_i32, parallelism_scaled_u32};
use super::{cf_opts::cache_size_f64, logger::handle as handle_log};
/// Create database-wide options suitable for opening the database. This also
@ -23,8 +21,8 @@ pub(crate) fn db_options(config: &Config, env: &Env, row_cache: &Cache) -> Resul
set_logging_defaults(&mut opts, config);
// Processing
opts.set_max_background_jobs(num_threads::<i32>(config)?);
opts.set_max_subcompactions(num_threads::<u32>(config)?);
opts.set_max_background_jobs(parallelism_scaled_i32(1));
opts.set_max_subcompactions(parallelism_scaled_u32(1));
opts.set_avoid_unnecessary_blocking_io(true);
opts.set_max_file_opening_threads(0);
@ -126,15 +124,3 @@ fn set_logging_defaults(opts: &mut Options, config: &Config) {
opts.set_callback_logger(rocksdb_log_level, &handle_log);
}
}
fn num_threads<T: TryFrom<usize>>(config: &Config) -> Result<T> {
const MIN_PARALLELISM: usize = 2;
let requested = if config.rocksdb_parallelism_threads != 0 {
config.rocksdb_parallelism_threads
} else {
utils::available_parallelism()
};
utils::math::try_into::<T, usize>(cmp::max(MIN_PARALLELISM, requested))
}

View file

@ -3,7 +3,7 @@ use std::{fmt::Debug, mem};
use bytes::Bytes;
use conduwuit::{
Err, Error, Result, debug, debug::INFO_SPAN_LEVEL, debug_error, debug_warn, err,
error::inspect_debug_log, implement, trace, utils::string::EMPTY,
error::inspect_debug_log, implement, trace, utils::string::EMPTY, warn,
};
use http::{HeaderValue, header::AUTHORIZATION};
use ipaddress::IPAddress;
@ -193,7 +193,7 @@ fn handle_error(
) -> Result {
if e.is_timeout() || e.is_connect() {
e = e.without_url();
debug_warn!("{e:?}");
debug_warn!(?url, "network error while sending request: {e:?}");
} else if e.is_redirect() {
debug_error!(
method = ?method,
@ -204,7 +204,7 @@ fn handle_error(
e,
);
} else {
debug_error!("{e:?}");
warn!(?url, "failed to send federation request: {e:?}");
}
Err(e.into())

View file

@ -100,7 +100,7 @@ impl Service {
/// Pings the presence of the given user in the given room, setting the
/// specified state.
pub async fn ping_presence(&self, user_id: &UserId, new_state: &PresenceState) -> Result<()> {
const REFRESH_TIMEOUT: u64 = 60 * 1000;
const REFRESH_TIMEOUT: u64 = 60 * 1000 * 4;
let last_presence = self.db.get_presence(user_id).await;
let state_changed = match last_presence {

View file

@ -306,28 +306,25 @@ impl super::Service {
#[tracing::instrument(name = "srv", level = "debug", skip(self))]
async fn query_srv_record(&self, hostname: &'_ str) -> Result<Option<FedDest>> {
let hostnames =
[format!("_matrix-fed._tcp.{hostname}."), format!("_matrix._tcp.{hostname}.")];
self.services.server.check_running()?;
for hostname in hostnames {
self.services.server.check_running()?;
debug!("querying SRV for {hostname:?}");
debug!("querying SRV for {hostname:?}");
let hostname = hostname.trim_end_matches('.');
match self.resolver.resolver.srv_lookup(hostname).await {
| Err(e) => Self::handle_resolve_error(&e, hostname)?,
| Ok(result) => {
return Ok(result.iter().next().map(|result| {
FedDest::Named(
result.target().to_string().trim_end_matches('.').to_owned(),
format!(":{}", result.port())
.as_str()
.try_into()
.unwrap_or_else(|_| FedDest::default_port()),
)
}));
},
}
let hostname_suffix = format!("_matrix-fed._tcp.{hostname}.");
let hostname = hostname_suffix.trim_end_matches('.');
match self.resolver.resolver.srv_lookup(hostname).await {
| Err(e) => Self::handle_resolve_error(&e, hostname)?,
| Ok(result) => {
return Ok(result.iter().next().map(|result| {
FedDest::Named(
result.target().to_string().trim_end_matches('.').to_owned(),
format!(":{}", result.port())
.as_str()
.try_into()
.unwrap_or_else(|_| FedDest::default_port()),
)
}));
},
}
Ok(None)

View file

@ -53,9 +53,9 @@ impl Resolver {
opts.cache_size = config.dns_cache_entries as usize;
opts.preserve_intermediates = true;
opts.negative_min_ttl = Some(Duration::from_secs(config.dns_min_ttl_nxdomain));
opts.negative_max_ttl = Some(Duration::from_secs(60 * 60 * 24 * 30));
opts.negative_max_ttl = Some(Duration::from_secs(60 * 60 * 24));
opts.positive_min_ttl = Some(Duration::from_secs(config.dns_min_ttl));
opts.positive_max_ttl = Some(Duration::from_secs(60 * 60 * 24 * 7));
opts.positive_max_ttl = Some(Duration::from_secs(60 * 60 * 24));
opts.timeout = Duration::from_secs(config.dns_timeout);
opts.attempts = config.dns_attempts as usize;
opts.try_tcp_on_error = config.dns_tcp_fallback;

View file

@ -79,7 +79,7 @@ where
{
// Exponential backoff
const MIN_DURATION: u64 = 60 * 2;
const MAX_DURATION: u64 = 60 * 60 * 8;
const MAX_DURATION: u64 = 60 * 60;
if continue_exponential_backoff_secs(
MIN_DURATION,
MAX_DURATION,

View file

@ -122,10 +122,7 @@ where
}
// The original create event must be in the auth events
if !matches!(
auth_events.get(&(StateEventType::RoomCreate, String::new().into())),
Some(_) | None
) {
if !auth_events.contains_key(&(StateEventType::RoomCreate, String::new().into())) {
return Err!(Request(InvalidParam("Incoming event refers to wrong create event.")));
}

View file

@ -46,7 +46,7 @@ where
{
// Exponential backoff
const MIN_DURATION: u64 = 5 * 60;
const MAX_DURATION: u64 = 60 * 60 * 24;
const MAX_DURATION: u64 = 60 * 60;
if continue_exponential_backoff_secs(MIN_DURATION, MAX_DURATION, time.elapsed(), *tries) {
debug!(
?tries,

View file

@ -5,7 +5,7 @@ use conduwuit::{
matrix::{Event, EventTypeExt, PduEvent, StateKey, state_res},
trace,
utils::stream::{BroadbandExt, ReadyExt},
warn,
warn
};
use futures::{FutureExt, StreamExt, future::ready};
use ruma::{CanonicalJsonValue, RoomId, ServerName, events::StateEventType};
@ -175,7 +175,7 @@ where
let extremities: Vec<_> = self
.services
.state
.get_forward_extremities(room_id)
.get_forward_extremities(room_id, &state_lock)
.map(ToOwned::to_owned)
.ready_filter(|event_id| {
// Remove any that are referenced by this incoming event's prev_events
@ -193,6 +193,8 @@ where
.collect()
.await;
// Make it visible when filtering left no forward extremities at all.
if extremities.is_empty() {
    info!(
        "Retained zero extremities when upgrading outlier PDU to timeline PDU with {} previous events, event id: {}",
        incoming_pdu.prev_events.len(),
        incoming_pdu.event_id
    );
}
debug!(
"Retained {} extremities checked against {} prev_events",
extremities.len(),

View file

@ -388,6 +388,7 @@ impl Service {
pub fn get_forward_extremities<'a>(
&'a self,
room_id: &'a RoomId,
_state_lock: &'a RoomMutexGuard,
) -> impl Stream<Item = &EventId> + Send + '_ {
let prefix = (room_id, Interfix);

View file

@ -42,7 +42,7 @@ pub async fn create_hash_and_sign_event(
let prev_events: Vec<OwnedEventId> = self
.services
.state
.get_forward_extremities(room_id)
.get_forward_extremities(room_id, _mutex_lock)
.take(20)
.map(Into::into)
.collect()

View file

@ -401,16 +401,10 @@ impl Service {
fn num_senders(args: &crate::Args<'_>) -> usize {
const MIN_SENDERS: usize = 1;
// Limit the number of senders to the number of workers threads or number of
// cores, conservatively.
let max_senders = args
.server
.metrics
.num_workers()
.min(available_parallelism());
// Limit the maximum number of senders to the number of cores.
let max_senders = available_parallelism();
// If the user doesn't override the default 0, this is intended to then default
// to 1 for now as multiple senders is experimental.
// The configured default is the core count; clamp between 1 and core count.
args.server
.config
.sender_workers

View file

@ -10,7 +10,7 @@ use std::{
use base64::{Engine as _, engine::general_purpose::URL_SAFE_NO_PAD};
use conduwuit_core::{
Error, Event, Result, debug, err, error,
Error, Event, Result, debug, err, error, info,
result::LogErr,
trace,
utils::{
@ -142,7 +142,7 @@ impl Service {
}
fn handle_response_err(dest: Destination, statuses: &mut CurTransactionStatus, e: &Error) {
debug!(dest = ?dest, "{e:?}");
debug!(dest = ?dest, "error response: {e:?}");
statuses.entry(dest).and_modify(|e| {
*e = match e {
| TransactionStatus::Running => TransactionStatus::Failed(1, Instant::now()),
@ -177,7 +177,21 @@ impl Service {
if !new_events.is_empty() {
self.db.mark_as_active(new_events.iter());
let new_events_vec = new_events.into_iter().map(|(_, event)| event).collect();
let new_events_vec: Vec<SendingEvent> =
new_events.into_iter().map(|(_, event)| event).collect();
if let Some(status) = statuses.get(&dest.clone()) {
if matches!(status, TransactionStatus::Running) {
// If the server is in backoff, clear it
info!(
?dest,
"Catching up previously failed destination with {}+ new events",
new_events_vec.len()
);
statuses.insert(dest.clone(), TransactionStatus::Running);
}
}
futures.push(self.send_events(dest.clone(), new_events_vec));
} else {
statuses.remove(dest);
@ -859,12 +873,20 @@ impl Service {
pdus,
edus,
};
let pdu_count = request.pdus.len();
let edu_count = request.edus.len();
let result = self
.services
.federation
.execute_on(&self.services.client.sender, &server, request)
.await;
.await
.inspect(|_| {
info!(%txn_id, %server, "Sent {} PDUs, {} EDUs", pdu_count, edu_count);
})
.inspect_err(|e| {
info!(%txn_id, %server, "Failed to send transaction ({} PDUs, {} EDUs): {e:?}", pdu_count, edu_count);
});
for (event_id, result) in result.iter().flat_map(|resp| resp.pdus.iter()) {
if let Err(e) = result {