-
Notifications
You must be signed in to change notification settings - Fork 498
Description
I was having issues with multi row insertion using SOCI (namely speed). I've read the docs and I believe I'm following the best practices. I created my own simple test to try to figure out what the best way to do this was. I came up with 5 methods (the first 4 are using examples provided in the SOCI documentation) and the last method was just formatting a big long string with all the data. I was very surprised that method 5 (see code below) is 40x faster than all the other methods and I was even more surprised that the methods that were supposedly built for handling multiple rows at once showed the same performance as inserting each row one at a time.
Here's the test I came up with:
#include <iostream>
#include <soci.h>
#include <postgresql/soci-postgresql.h>
#include <chrono>
#include <random>
int main() {
{
soci::session s(soci::postgresql, "dbname=postgres user=postgres");
s << "DROP DATABASE IF EXISTS performance_test;";
s << "CREATE DATABASE performance_test;";
}
soci::session sql(soci::postgresql, "dbname=performance_test user=postgres");
sql << "CREATE TABLE method1(id BIGSERIAL NOT NULL, ival BIGINT NOT NULL, dval FLOAT(53) NOT NULL);";
sql << "CREATE TABLE method2(id BIGSERIAL NOT NULL, ival BIGINT NOT NULL, dval FLOAT(53) NOT NULL);";
sql << "CREATE TABLE method3(id BIGSERIAL NOT NULL, ival BIGINT NOT NULL, dval FLOAT(53) NOT NULL);";
sql << "CREATE TABLE method4(id BIGSERIAL NOT NULL, ival BIGINT NOT NULL, dval FLOAT(53) NOT NULL);";
sql << "CREATE TABLE method5(id BIGSERIAL NOT NULL, ival BIGINT NOT NULL, dval FLOAT(53) NOT NULL);";
size_t num_insertions = 1000;
size_t insertion_size = 25;
double method1_time = 0.0;
double method2_time = 0.0;
double method3_time = 0.0;
double method4_time = 0.0;
double method5_time = 0.0;
for (size_t i=0; i < num_insertions; ++i) {
std::vector<int> ints(insertion_size);
std::vector<double> doubles(insertion_size);
std::random_device rd;
std::mt19937 gen(rd()); // Mersenne Twister generator
std::uniform_real_distribution<> dis(-1000.0, 1000.0); // Uniform distribution for doubles
// Fill the vector with random doubles
for (size_t j = 0; j < insertion_size; ++j) {
ints.push_back(dis(gen));
doubles.push_back(dis(gen));
}
std::chrono::time_point<std::chrono::high_resolution_clock> now, end;
// method1
now = std::chrono::high_resolution_clock::now();
for (size_t j=0; j < ints.size(); ++j) {
sql << "INSERT INTO method1 (ival, dval) VALUES (" << ints[j] << ", " << doubles[j] << ")";
}
end = std::chrono::high_resolution_clock::now();
method1_time += std::chrono::duration<double>(end-now).count();
// method2
now = std::chrono::high_resolution_clock::now();
for (size_t j=0; j < ints.size(); ++j) {
sql << "INSERT INTO method2 (ival, dval) VALUES (:ival, :dval)", soci::use(ints[j]), soci::use(doubles[j]);
}
end = std::chrono::high_resolution_clock::now();
method2_time += std::chrono::duration<double>(end-now).count();
// method3
now = std::chrono::high_resolution_clock::now();
int ival;
double dval;
soci::statement st3 = (sql.prepare <<
"INSERT INTO method3 (ival, dval) VALUES (:ival, :dval)",
soci::use(ival), soci::use(dval));
for (size_t j=0; j < ints.size(); ++j) {
ival = ints[j];
dval = doubles[j];
st3.execute(true);
}
end = std::chrono::high_resolution_clock::now();
method3_time += std::chrono::duration<double>(end-now).count();
// method4
now = std::chrono::high_resolution_clock::now();
soci::statement st4 = (sql.prepare <<
"INSERT INTO method4 (ival, dval) VALUES (:ivals, :dvals)",
soci::use(ints), soci::use(doubles));
st4.execute(true);
end = std::chrono::high_resolution_clock::now();
method4_time += std::chrono::duration<double>(end-now).count();
// method5
now = std::chrono::high_resolution_clock::now();
std::string str = "INSERT INTO method5 (ival, dval) VALUES";
for (size_t j=0; j < ints.size(); ++j) {
str += " (" + std::to_string(ints[j]) + ", " + std::to_string(doubles[j]) + "),";
}
str.pop_back();
sql << str;
end = std::chrono::high_resolution_clock::now();
method5_time += std::chrono::duration<double>(end-now).count();
if (i%50 == 0) {
std::cout << static_cast<double>(i) / static_cast<double>(num_insertions) << std::endl;
}
}
std::cout << "Method1: " << method1_time << std::endl;
std::cout << "Method2: " << method2_time << std::endl;
std::cout << "Method3: " << method3_time << std::endl;
std::cout << "Method4: " << method4_time << std::endl;
std::cout << "Method5: " << method5_time << std::endl;
return 0;
}
This yielded the following cumulative time output (in seconds):
Method1: 85.6718
Method2: 86.5225
Method3: 85.0977
Method4: 84.6804
Method5: 1.94613
I realize that method 5 can probably be optimized a little more by using stringstream, but the bigger question I have is: am I doing something wrong here? I wouldn't expect the SOCI methods to be 40x slower than method 5. Furthermore, I'd expect the SOCI vector operations to at least be noticeably faster than looping over an insert statement a bunch of times.
Any insight here would be greatly appreciated! Thanks