/* * Odyssey. * * Scalable PostgreSQL connection pooler. */ #include #include #include #include #include void od_backend_close(od_server_t *server) { assert(server->route == NULL); assert(server->io.io == NULL); assert(server->tls == NULL); server->is_transaction = 0; server->idle_time = 0; kiwi_key_init(&server->key); kiwi_key_init(&server->key_client); od_server_free(server); } static inline int od_backend_terminate(od_server_t *server) { machine_msg_t *msg; msg = kiwi_fe_write_terminate(NULL); if (msg == NULL) return -1; return od_write(&server->io, msg); } void od_backend_close_connection(od_server_t *server) { /* failed to connect to endpoint, so notring to do */ if (server->io.io == NULL) { return; } if (machine_connected(server->io.io)) od_backend_terminate(server); od_io_close(&server->io); if (server->error_connect) { machine_msg_free(server->error_connect); server->error_connect = NULL; } if (server->tls) { machine_tls_free(server->tls); server->tls = NULL; } } void od_backend_error(od_server_t *server, char *context, char *data, uint32_t size) { od_instance_t *instance = server->global->instance; kiwi_fe_error_t error; int rc; rc = kiwi_fe_read_error(data, size, &error); if (rc == -1) { od_error(&instance->logger, context, server->client, server, "failed to parse error message from server"); return; } od_error(&instance->logger, context, server->client, server, "%s %s %s", error.severity, error.code, error.message); if (error.detail) { od_error(&instance->logger, context, server->client, server, "DETAIL: %s", error.detail); } if (error.hint) { od_error(&instance->logger, context, server->client, server, "HINT: %s", error.hint); } } int od_backend_ready(od_server_t *server, char *data, uint32_t size) { int status; int rc; rc = kiwi_fe_read_ready(data, size, &status); if (rc == -1) return -1; if (status == 'I') { /* no active transaction */ server->is_transaction = 0; } else if (status == 'T' || status == 'E') { /* in active transaction or in interrupted * transaction block */ server->is_transaction = 1; } /* update server sync reply state */ od_server_sync_reply(server); return 0; } static inline int od_backend_startup(od_server_t *server, kiwi_params_t *route_params, od_client_t *client) { od_instance_t *instance = server->global->instance; od_route_t *route = server->route; #define DEFAULT_ARGV_SIZE 6 kiwi_fe_arg_t argv[DEFAULT_ARGV_SIZE + 2 * route->rule->backend_startup_vars_sz]; kiwi_fe_arg_t default_argv[] = { { "user", 5 }, { route->id.user, route->id.user_len }, { "database", 9 }, { route->id.database, route->id.database_len }, { "replication", 12 }, { NULL, 0 } }; od_debug(&instance->logger, "startup", NULL, server, "startup server connection with user %s & database %s", route->id.user, route->id.database); for (size_t i = 0; i < route->rule->backend_startup_vars_sz; i++) { argv[i << 1].name = route->rule->backend_startup_vars[i].name; argv[i << 1].len = route->rule->backend_startup_vars[i].name_len + 1; argv[i << 1 | 1].name = route->rule->backend_startup_vars[i].value; argv[i << 1 | 1].len = route->rule->backend_startup_vars[i].value_len + 1; } int argc = route->rule->backend_startup_vars_sz * 2; for (size_t i = 0; i < DEFAULT_ARGV_SIZE; ++i) { argv[argc + i] = default_argv[i]; } argc += 4; if (route->id.physical_rep) { argv[argc + 1].name = "on"; argv[argc + 1].len = 3; argc += 2; } else if (route->id.logical_rep) { argv[argc + 1].name = "database"; argv[argc + 1].len = 9; argc += 2; } machine_msg_t *msg; msg = kiwi_fe_write_startup_message(NULL, argc, argv); if (msg == NULL) return -1; int rc; rc = od_write(&server->io, msg); if (rc == -1) { od_error(&instance->logger, "startup", NULL, server, "write error: %s", od_io_error(&server->io)); return -1; } /* update request count and sync state */ od_server_sync_request(server, 1); assert(server->client); while (1) { msg = od_read(&server->io, UINT32_MAX); if (msg == NULL) { od_error(&instance->logger, "startup", client, server, "read error: %s", od_io_error(&server->io)); return -1; } kiwi_be_type_t type = *(char *)machine_msg_data(msg); od_debug(&instance->logger, "startup", client, server, "received packet type: %s", kiwi_be_type_to_string(type)); switch (type) { case KIWI_BE_READY_FOR_QUERY: od_backend_ready(server, machine_msg_data(msg), machine_msg_size(msg)); machine_msg_free(msg); return 0; case KIWI_BE_AUTHENTICATION: rc = od_auth_backend(server, msg, client); machine_msg_free(msg); if (rc == -1) return -1; break; case KIWI_BE_BACKEND_KEY_DATA: rc = kiwi_fe_read_key(machine_msg_data(msg), machine_msg_size(msg), &server->key); machine_msg_free(msg); if (rc == -1) { od_error( &instance->logger, "startup", client, server, "failed to parse BackendKeyData message"); return -1; } break; case KIWI_BE_PARAMETER_STATUS: { char *name; uint32_t name_len; char *value; uint32_t value_len; rc = kiwi_fe_read_parameter(machine_msg_data(msg), machine_msg_size(msg), &name, &name_len, &value, &value_len); if (rc == -1) { machine_msg_free(msg); od_error( &instance->logger, "startup", client, server, "failed to parse ParameterStatus message"); return -1; } /* set server parameters */ kiwi_vars_update(&server->vars, name, name_len, value, value_len); if (route_params) { // skip volatile params // we skip in_hot_standby here because it may change // during connection lifetime, if server was // promoted if (name_len != sizeof("in_hot_standby") || strncmp(name, "in_hot_standby", name_len)) { kiwi_param_t *param; param = kiwi_param_allocate(name, name_len, value, value_len); if (param) kiwi_params_add(route_params, param); } } machine_msg_free(msg); break; } case KIWI_BE_NOTICE_RESPONSE: machine_msg_free(msg); break; case KIWI_BE_ERROR_RESPONSE: od_backend_error(server, "startup", machine_msg_data(msg), machine_msg_size(msg)); server->error_connect = msg; return -1; default: machine_msg_free(msg); od_debug(&instance->logger, "startup", client, server, "unexpected message: %s", kiwi_be_type_to_string(type)); return -1; } } od_unreachable(); return 0; } static inline int od_backend_connect_to(od_server_t *server, char *context, char *host, int port, od_tls_opts_t *tlsopts) { od_instance_t *instance = server->global->instance; assert(server->io.io == NULL); /* create io handle */ machine_io_t *io; io = machine_io_create(); if (io == NULL) return -1; /* set network options */ machine_set_nodelay(io, instance->config.nodelay); if (instance->config.keepalive > 0) { machine_set_keepalive(io, 1, instance->config.keepalive, instance->config.keepalive_keep_interval, instance->config.keepalive_probes, instance->config.keepalive_usr_timeout); } int rc; rc = od_io_prepare(&server->io, io, instance->config.readahead); if (rc == -1) { od_error(&instance->logger, context, NULL, server, "failed to set server io"); machine_close(io); machine_io_free(io); return -1; } /* set tls options */ if (tlsopts->tls_mode != OD_CONFIG_TLS_DISABLE) { server->tls = od_tls_backend(tlsopts); if (server->tls == NULL) return -1; } uint64_t time_connect_start = 0; if (instance->config.log_session) time_connect_start = machine_time_us(); struct sockaddr_un saddr_un; struct sockaddr_in saddr_v4; struct sockaddr_in6 saddr_v6; struct sockaddr *saddr; struct addrinfo *ai = NULL; /* resolve server address */ if (host) { /* assume IPv6 or IPv4 is specified */ int rc_resolve = -1; if (strchr(host, ':')) { /* v6 */ memset(&saddr_v6, 0, sizeof(saddr_v6)); saddr_v6.sin6_family = AF_INET6; saddr_v6.sin6_port = htons(port); rc_resolve = inet_pton(AF_INET6, host, &saddr_v6.sin6_addr); saddr = (struct sockaddr *)&saddr_v6; } else { /* v4 or hostname */ memset(&saddr_v4, 0, sizeof(saddr_v4)); saddr_v4.sin_family = AF_INET; saddr_v4.sin_port = htons(port); rc_resolve = inet_pton(AF_INET, host, &saddr_v4.sin_addr); saddr = (struct sockaddr *)&saddr_v4; } /* schedule getaddrinfo() execution */ if (rc_resolve != 1) { char rport[16]; od_snprintf(rport, sizeof(rport), "%d", port); rc = machine_getaddrinfo(host, rport, NULL, &ai, 0); if (rc != 0) { od_error(&instance->logger, context, NULL, server, "failed to resolve %s:%d", host, port); return NOT_OK_RESPONSE; } assert(ai != NULL); saddr = ai->ai_addr; } /* connected */ } else { /* set unix socket path */ memset(&saddr_un, 0, sizeof(saddr_un)); saddr_un.sun_family = AF_UNIX; saddr = (struct sockaddr *)&saddr_un; od_snprintf(saddr_un.sun_path, sizeof(saddr_un.sun_path), "%s/.s.PGSQL.%d", instance->config.unix_socket_dir, port); } uint64_t time_resolve = 0; if (instance->config.log_session) { time_resolve = machine_time_us() - time_connect_start; } /* connect to server */ rc = machine_connect(server->io.io, saddr, UINT32_MAX); if (ai) { freeaddrinfo(ai); } if (rc == NOT_OK_RESPONSE) { if (host) { od_error(&instance->logger, context, server->client, server, "failed to connect to %s:%d", host, port); } else { od_error(&instance->logger, context, server->client, server, "failed to connect to %s", saddr_un.sun_path); } return NOT_OK_RESPONSE; } /* do tls handshake */ if (tlsopts->tls_mode != OD_CONFIG_TLS_DISABLE) { rc = od_tls_backend_connect(server, &instance->logger, tlsopts); if (rc == NOT_OK_RESPONSE) { return NOT_OK_RESPONSE; } } uint64_t time_connect = 0; if (instance->config.log_session) { time_connect = machine_time_us() - time_connect_start; } /* log server connection */ if (instance->config.log_session) { if (host) { od_log(&instance->logger, context, server->client, server, "new server connection %s:%d (connect time: %d usec, " "resolve time: %d usec)", host, port, (int)time_connect, (int)time_resolve); } else { od_log(&instance->logger, context, server->client, server, "new server connection %s (connect time: %d usec, resolve " "time: %d usec)", saddr_un.sun_path, (int)time_connect, (int)time_resolve); } } return 0; } static inline int od_storage_parse_rw_check_response(machine_msg_t *msg) { char *pos = (char *)machine_msg_data(msg) + 1; uint32_t pos_size = machine_msg_size(msg) - 1; /* size */ uint32_t size; int rc; rc = kiwi_read32(&size, &pos, &pos_size); if (kiwi_unlikely(rc == -1)) goto error; /* count */ uint16_t count; rc = kiwi_read16(&count, &pos, &pos_size); if (kiwi_unlikely(rc == -1)) goto error; if (count != 1) goto error; /* (not used) */ uint32_t resp_len; rc = kiwi_read32(&resp_len, &pos, &pos_size); if (kiwi_unlikely(rc == -1)) { goto error; } /* we expect exactly one row */ if (resp_len != 1) { return NOT_OK_RESPONSE; } /* pg is in recovery false means db is open for write */ if (pos[0] == 'f') { return OK_RESPONSE; } /* fallthrough to error */ error: return NOT_OK_RESPONSE; } static inline od_retcode_t od_backend_attemp_connect_with_tsa( od_server_t *server, char *context, kiwi_params_t *route_params, char *host, int port, od_tls_opts_t *opts, od_target_session_attrs_t attrs, od_client_t *client) { assert(attrs == OD_TARGET_SESSION_ATTRS_RO || attrs == OD_TARGET_SESSION_ATTRS_RW); od_retcode_t rc; machine_msg_t *msg; rc = od_backend_connect_to(server, context, host, port, opts); if (rc == NOT_OK_RESPONSE) { od_backend_close_connection(server); return rc; } /* send startup and do initial configuration */ rc = od_backend_startup(server, route_params, client); if (rc == NOT_OK_RESPONSE) { od_backend_close_connection(server); return rc; } /* Check if server is read-write */ msg = od_query_do(server, context, "SELECT pg_is_in_recovery()", NULL); if (msg == NULL) { od_backend_close_connection(server); return NOT_OK_RESPONSE; } switch (attrs) { case OD_TARGET_SESSION_ATTRS_RW: rc = od_storage_parse_rw_check_response(msg); break; case OD_TARGET_SESSION_ATTRS_RO: /* this is primary, but we are forsed to find ro backend */ if (od_storage_parse_rw_check_response(msg) == OK_RESPONSE) { rc = NOT_OK_RESPONSE; } else { rc = OK_RESPONSE; } break; default: abort(); } machine_msg_free(msg); if (rc != OK_RESPONSE) { od_backend_close_connection(server); } return rc; } int od_backend_connect(od_server_t *server, char *context, kiwi_params_t *route_params, od_client_t *client) { od_route_t *route = server->route; assert(route != NULL); od_instance_t *instance = server->global->instance; od_rule_storage_t *storage; storage = route->rule->storage; /* connect to server */ od_retcode_t rc; size_t i; switch (storage->target_session_attrs) { case OD_TARGET_SESSION_ATTRS_RW: for (i = 0; i < storage->endpoints_count; ++i) { if (od_backend_attemp_connect_with_tsa( server, context, route_params, storage->endpoints[i].host, storage->endpoints[i].port, storage->tls_opts, OD_TARGET_SESSION_ATTRS_RW, client) == NOT_OK_RESPONSE) { /*backend connection not macthed by TSA */ assert(server->io.io == NULL); continue; } /* target host found! */ od_debug(&instance->logger, context, NULL, server, "primary found on %s:%d", storage->endpoints[i].host, storage->endpoints[i].port); server->endpoint_selector = i; return OK_RESPONSE; } od_debug(&instance->logger, context, NULL, server, "failed to find primary within %s", storage->host); return NOT_OK_RESPONSE; case OD_TARGET_SESSION_ATTRS_RO: for (i = 0; i < storage->endpoints_count; ++i) { if (od_backend_attemp_connect_with_tsa( server, context, route_params, storage->endpoints[i].host, storage->endpoints[i].port, storage->tls_opts, OD_TARGET_SESSION_ATTRS_RO, client) == NOT_OK_RESPONSE) { /*backend connection not macthed by TSA */ assert(server->io.io == NULL); continue; } /* target host found! */ od_debug(&instance->logger, context, NULL, server, "standby found on %s:%d", storage->endpoints[i].host, storage->endpoints[i].port); server->endpoint_selector = i; return OK_RESPONSE; } od_debug(&instance->logger, context, NULL, server, "failed to find standby within %s", storage->host); return NOT_OK_RESPONSE; case OD_TARGET_SESSION_ATTRS_ANY: /* fall throught */ default:; /* use rr_counter here */ char *host = NULL; /* For UNIX socket */ int port = storage->port; if (storage->endpoints_count) { host = storage->endpoints[0].host; if (storage->endpoints[0].port) port = storage->endpoints[0].port; } rc = od_backend_connect_to(server, context, host, port, storage->tls_opts); if (rc == NOT_OK_RESPONSE) { return NOT_OK_RESPONSE; } /* send startup and do initial configuration */ rc = od_backend_startup(server, route_params, client); if (rc == OK_RESPONSE) { server->endpoint_selector = 0; } return rc; } } int od_backend_connect_cancel(od_server_t *server, od_rule_storage_t *storage, kiwi_key_t *key) { od_instance_t *instance = server->global->instance; /* connect to server */ int rc; char *host = NULL; /* For UNIX socket */ int port = storage->port; if (storage->endpoints_count) { host = storage->endpoints[server->endpoint_selector].host; if (storage->endpoints[server->endpoint_selector].port) port = storage->endpoints[server->endpoint_selector] .port; } rc = od_backend_connect_to(server, "cancel", host, port, storage->tls_opts); if (rc == NOT_OK_RESPONSE) { return NOT_OK_RESPONSE; } /* send cancel request */ machine_msg_t *msg; msg = kiwi_fe_write_cancel(NULL, key->key_pid, key->key); if (msg == NULL) return -1; rc = od_write(&server->io, msg); if (rc == -1) { od_error(&instance->logger, "cancel", NULL, NULL, "write error: %s", od_io_error(&server->io)); return -1; } return 0; } int od_backend_update_parameter(od_server_t *server, char *context, char *data, uint32_t size, int server_only) { od_instance_t *instance = server->global->instance; od_client_t *client = server->client; char *name; uint32_t name_len; char *value; uint32_t value_len; int rc; rc = kiwi_fe_read_parameter(data, size, &name, &name_len, &value, &value_len); if (rc == -1) { od_error(&instance->logger, context, NULL, server, "failed to parse ParameterStatus message"); return -1; } /* update server only or client and server parameter */ od_debug(&instance->logger, context, client, server, "%.*s = %.*s", name_len, name, value_len, value); if (server_only) { kiwi_vars_update(&server->vars, name, name_len, value, value_len); } else { kiwi_vars_update_both(&client->vars, &server->vars, name, name_len, value, value_len); } return 0; } int od_backend_ready_wait(od_server_t *server, char *context, int count, uint32_t time_ms, uint32_t ignore_errors) { od_instance_t *instance = server->global->instance; int ready = 0; int query_rc; query_rc = 0; for (; !od_server_synchronized(server);) { machine_msg_t *msg; msg = od_read(&server->io, time_ms); if (msg == NULL) { if (!machine_timedout()) { od_error(&instance->logger, context, server->client, server, "read error: %s", od_io_error(&server->io)); } return -1; } kiwi_be_type_t type = *(char *)machine_msg_data(msg); od_debug(&instance->logger, context, server->client, server, "%s", kiwi_be_type_to_string(type)); if (type == KIWI_BE_PARAMETER_STATUS) { /* update server parameter */ int rc; rc = od_backend_update_parameter(server, context, machine_msg_data(msg), machine_msg_size(msg), 1); machine_msg_free(msg); if (rc == -1) { return -1; } } else if (type == KIWI_BE_ERROR_RESPONSE) { od_backend_error(server, context, machine_msg_data(msg), machine_msg_size(msg)); machine_msg_free(msg); if (!ignore_errors) { query_rc = -1; } } else if (type == KIWI_BE_READY_FOR_QUERY) { od_backend_ready(server, machine_msg_data(msg), machine_msg_size(msg)); machine_msg_free(msg); ready++; } else { machine_msg_free(msg); } } return query_rc; /* never reached */ } od_retcode_t od_backend_query_send(od_server_t *server, char *context, char *query, char *param, int len) { od_instance_t *instance = server->global->instance; machine_msg_t *msg; if (param) { msg = kiwi_fe_write_prep_stmt(NULL, query, param); } else { msg = kiwi_fe_write_query(NULL, query, len); } if (msg == NULL) { return NOT_OK_RESPONSE; } int rc; rc = od_write(&server->io, msg); if (rc == -1) { od_error(&instance->logger, context, server->client, server, "write error: %s", od_io_error(&server->io)); return NOT_OK_RESPONSE; } /* update server sync state */ od_server_sync_request(server, 1); assert(server->client); return OK_RESPONSE; } od_retcode_t od_backend_query(od_server_t *server, char *context, char *query, char *param, int len, uint32_t timeout, uint32_t count, uint32_t ignore_errors) { if (od_backend_query_send(server, context, query, param, len) == NOT_OK_RESPONSE) { return NOT_OK_RESPONSE; } od_retcode_t rc = od_backend_ready_wait(server, context, count, timeout, ignore_errors); return rc; }