mirror of https://github.com/NexVeridian/wikidata-to-surrealdb.git
synced 2025-09-01 17:39:12 +00:00
bulk insert and benchmarks
This commit is contained in: parent dc85c7d997, commit 82edfdfbd3
12 changed files with 289 additions and 54 deletions
@@ -11,3 +11,4 @@ docker-compose.dev.yml
 docker-compose.yml
 dockerfile
 *.md
+.torrent

.gitignore (vendored)
@@ -106,3 +106,4 @@ venv.bak/
 /data
 /target
 Cargo.lock
+.torrent

@@ -17,7 +17,7 @@ Run tests with `cargo t`
 ## Docker Compose
 `git clone`
 
-`docker compose -f docker-compose.dev.yml build && docker compose -f docker-compose.dev.yml up`
+`docker compose -f docker-compose.dev.yml build && docker compose -f docker-compose.dev.yml up --pull always -d`
 
 Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount`

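In the new invocation, `--pull always` refreshes the images before starting and `-d` runs the stack detached, so the build-and-run one-liner no longer ties up the terminal.
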
@@ -8,7 +8,7 @@ license = "MIT OR Apache-2.0"
 anyhow = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
-surrealdb = "1.0"
+surrealdb = "1.1"
 tokio = { version = "1.35", features = ["time"] }
 futures = "0.3"
 wikidata = "0.3.1"

@@ -17,4 +17,9 @@ lazy_static = "1.4"
 indicatif = "0.17"
 
 [dev-dependencies]
-surrealdb = { version = "1.0", features = ["kv-mem"] }
+surrealdb = { version = "1.1", features = ["kv-mem"] }
+criterion = { version = "0.5", features = ["async_tokio"] }
+
+[[bench]]
+name = "bench"
+harness = false

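Criterion benchmarks need `harness = false` on the `[[bench]]` target so the crate's `criterion_main!` entry point replaces the default libtest harness; the new target is run with `cargo bench`.
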
@@ -1,4 +1,4 @@
-Copyright 2023, Elijah McMorris ( NexVeridian )
+Copyright 2024, Elijah McMorris ( NexVeridian )
 
 Apache License
 Version 2.0, January 2004

@@ -177,7 +177,7 @@ Copyright 2023, Elijah McMorris ( NexVeridian )
 
 END OF TERMS AND CONDITIONS
 
-Copyright 2023, Elijah McMorris ( NexVeridian )
+Copyright 2024, Elijah McMorris ( NexVeridian )
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Elijah McMorris (NexVeridian)
+Copyright (c) 2024 Elijah McMorris (NexVeridian)
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

@@ -1,5 +1,9 @@
 # Wikidata to SurrealDB
 A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database, from either a bz2 or json file.
+
+The SurrealDB database is ~2.6GB uncompressed or 0.5GB compressed, while the bz2 file is ~80GB, the gzip file is ~130GB, and the uncompressed json file is over 1TB.
+
+Querying the entire database takes ~2 seconds per query. Building the database on a 7600k takes ~55 hours; a CPU with more cores should be faster.
 
 # Getting The Data
 https://www.wikidata.org/wiki/Wikidata:Data_access

@@ -43,8 +47,10 @@ FILE_NAME=data/latest-all.json.bz2
 # If not using docker file for Wikidata to SurrealDB, use 0.0.0.0:8000
 WIKIDATA_DB_PORT=surrealdb:8000
 THREADED_REQUESTS=true
+WIKIDATA_BULK_INSERT=true
 # true=overwrite existing data, false=skip if already exists
 OVERWRITE_DB=false
+INDIVIDUAL_WS=true
 ```
 
 # [Dev Install](./CONTRIBUTING.md#dev-install)

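To put the per-query figure in context, a whole-table count (the same query shape the test suite uses) can be issued from Rust. A minimal sketch, assuming a locally reachable instance on the default port; the address and the omitted `db.signin(...)` call are assumptions, not the repo's code:

```rust
use anyhow::Result;
use surrealdb::{engine::remote::ws::Ws, Surreal};

#[tokio::main]
async fn main() -> Result<()> {
    // Connect to the instance started by docker compose; add db.signin(...)
    // here if your server requires credentials.
    let db = Surreal::new::<Ws>("0.0.0.0:8000").await?;
    db.use_ns("wikidata").use_db("wikidata").await?;

    // Count every record in the Entity table; on the full dump this is the
    // ~2 second class of query described above.
    let count: Option<f32> = db
        .query("return count(select * from Entity);")
        .await?
        .take(0)?;
    println!("Entity rows: {count:?}");
    Ok(())
}
```
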
benches/bench.rs (new file, 79 lines, all added)
@@ -0,0 +1,79 @@
use anyhow::{Error, Ok, Result};
use criterion::{criterion_group, criterion_main, Criterion};
use std::{env, time::Duration};
use surrealdb::{
    engine::local::{Db, Mem},
    Surreal,
};
use tokio::runtime::Runtime;

use wikidata_to_surrealdb::utils::*;

async fn inti_db() -> Result<Surreal<Db>, Error> {
    env::set_var("WIKIDATA_LANG", "en");
    env::set_var("OVERWRITE_DB", "true");

    let db = Surreal::new::<Mem>(()).await?;
    db.use_ns("wikidata").use_db("wikidata").await?;

    Ok(db)
}

fn bench(c: &mut Criterion) {
    let mut group = c.benchmark_group("Create DB Entities");

    group.bench_function("Single Insert", |b| {
        b.iter(|| {
            let rt = Runtime::new().unwrap();
            rt.block_on(async {
                let db = inti_db().await.unwrap();
                let reader = File_Format::new("json")
                    .reader("tests/data/bench.json")
                    .unwrap();

                create_db_entities_threaded(
                    Some(db.clone()),
                    reader,
                    None,
                    1000,
                    100,
                    CreateVersion::Single,
                )
                .await
                .unwrap();
            })
        })
    });

    group.bench_function("Bulk Insert", |b| {
        b.iter(|| {
            let rt = Runtime::new().unwrap();
            rt.block_on(async {
                let db = inti_db().await.unwrap();
                let reader = File_Format::new("json")
                    .reader("tests/data/bench.json")
                    .unwrap();

                create_db_entities_threaded(
                    Some(db.clone()),
                    reader,
                    None,
                    1000,
                    100,
                    CreateVersion::Bulk,
                )
                .await
                .unwrap();
            })
        })
    });

    group.finish();
}

criterion_group! {
    name = benches;
    config = Criterion::default().measurement_time(Duration::from_secs(60));
    targets = bench
}
criterion_main!(benches);

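Since the `async_tokio` feature is enabled in Cargo.toml, an alternative would be to build the runtime once and let Criterion poll the future itself via `to_async`; a sketch of the single-insert case under that assumption, reusing the helpers above:

```rust
fn bench_to_async(c: &mut Criterion) {
    let rt = Runtime::new().unwrap();

    c.bench_function("Single Insert (to_async)", |b| {
        // Criterion drives the future on the provided runtime, so the
        // runtime is constructed once instead of once per iteration.
        b.to_async(&rt).iter(|| async {
            let db = inti_db().await.unwrap();
            let reader = File_Format::new("json")
                .reader("tests/data/bench.json")
                .unwrap();

            create_db_entities_threaded(Some(db), reader, None, 1000, 100, CreateVersion::Single)
                .await
                .unwrap();
        })
    });
}
```
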
src/main.rs
@@ -1,7 +1,6 @@
 use anyhow::{Error, Ok, Result};
-use indicatif::{ProgressBar, ProgressState, ProgressStyle};
 use lazy_static::lazy_static;
-use std::{env, fmt::Write, io::BufRead};
+use std::{env, io::BufRead};
 use surrealdb::{engine::remote::ws::Client, Surreal};
 use tokio::time::{sleep, Duration};
 mod utils;

@@ -16,25 +15,16 @@ lazy_static! {
         .expect("THREADED_REQUESTS not set")
         .parse()
         .expect("Failed to parse THREADED_REQUESTS");
+    static ref WIKIDATA_BULK_INSERT: bool = env::var("WIKIDATA_BULK_INSERT")
+        .expect("WIKIDATA_BULK_INSERT not set")
+        .parse()
+        .expect("Failed to parse WIKIDATA_BULK_INSERT");
 }
 
 #[tokio::main]
 async fn main() -> Result<(), Error> {
     sleep(Duration::from_secs(10)).await;
-    let total_size = 113_000_000;
-
-    let pb = ProgressBar::new(total_size);
-    pb.set_style(
-        ProgressStyle::with_template(
-            "[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} ETA:[{eta}]",
-        )?
-        .with_key("eta", |state: &ProgressState, w: &mut dyn Write| {
-            let sec = state.eta().as_secs();
-            let min = (sec / 60) % 60;
-            let hr = (sec / 60) / 60;
-            write!(w, "{}:{:02}:{:02}", hr, min, sec % 60).unwrap()
-        }),
-    );
+    let pb = create_pb().await;
 
     let db = create_db_ws().await?;
     let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;

@@ -64,9 +54,26 @@ async fn main() -> Result<(), Error> {
                 pb.inc(100);
             }
         }
+    } else if *WIKIDATA_BULK_INSERT {
+        create_db_entities_threaded(
+            None::<Surreal<Client>>,
+            reader,
+            Some(pb.clone()),
+            2500,
+            100,
+            CreateVersion::Bulk,
+        )
+        .await?;
     } else {
-        create_db_entities_threaded(None::<Surreal<Client>>, reader, Some(pb.clone()), 2500, 100)
-            .await?;
+        create_db_entities_threaded(
+            None::<Surreal<Client>>,
+            reader,
+            Some(pb.clone()),
+            2500,
+            100,
+            CreateVersion::Single,
+        )
+        .await?;
     }
 
     pb.finish();
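The first branch (shown only by its trailing lines) is unchanged; the new `else if` splits the threaded path on `WIKIDATA_BULK_INSERT`, so the same `create_db_entities_threaded` call now runs with either `CreateVersion::Bulk` or `CreateVersion::Single`.
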
src/utils.rs
@@ -1,7 +1,7 @@
 use anyhow::{Error, Result};
 use bzip2::read::MultiBzDecoder;
 use futures::future::join_all;
-use indicatif::ProgressBar;
+use indicatif::{ProgressBar, ProgressState, ProgressStyle};
 use lazy_static::lazy_static;
 use serde_json::{from_str, Value};
 use std::{

@@ -96,21 +96,95 @@ pub async fn create_db_entities(
     Ok(())
 }
 
+pub async fn create_db_entities_bulk(
+    db: &Surreal<impl Connection>,
+    lines: &[String],
+    pb: &Option<ProgressBar>,
+) -> Result<(), Error> {
+    let lines = lines
+        .iter()
+        .map(|line| line.trim().trim_end_matches(',').to_string())
+        .filter(|line| line != "[" && line != "]")
+        .collect::<Vec<String>>();
+
+    let mut data_vec: Vec<EntityMini> = Vec::new();
+    let mut claims_vec: Vec<Claims> = Vec::new();
+    let mut property_vec: Vec<EntityMini> = Vec::new();
+    let mut lexeme_vec: Vec<EntityMini> = Vec::new();
+
+    for line in lines {
+        let json: Value = from_str(&line).expect("Failed to parse JSON");
+        let data = Entity::from_json(json).expect("Failed to parse JSON");
+        let (claims, data) = EntityMini::from_entity(data);
+        match data.id.clone().expect("No ID").tb.as_str() {
+            "Property" => property_vec.push(data),
+            "Lexeme" => lexeme_vec.push(data),
+            "Entity" => data_vec.push(data),
+            _ => panic!("Unknown table"),
+        }
+        claims_vec.push(claims);
+    }
+
+    db.query("insert into Entity ($data_vec) RETURN NONE;")
+        .bind(("data_vec", data_vec))
+        .await?;
+    db.query("insert into Claims ($claims_vec) RETURN NONE;")
+        .bind(("claims_vec", claims_vec))
+        .await?;
+    db.query("insert into Property ($property_vec) RETURN NONE;")
+        .bind(("property_vec", property_vec))
+        .await?;
+    db.query("insert into Lexeme ($lexeme_vec) RETURN NONE;")
+        .bind(("lexeme_vec", lexeme_vec))
+        .await?;
+
+    if let Some(ref p) = pb {
+        p.inc(100)
+    }
+    Ok(())
+}
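Each of the four statements above inserts an entire bound array in one round trip, which is where the bulk win comes from: a chunk costs four queries instead of one `CREATE` per record. The pattern in isolation, as a runnable sketch against the in-memory engine (the `kv-mem` feature from dev-dependencies); the `Row` type and table contents here are hypothetical, not the crate's `EntityMini`:

```rust
use serde::Serialize;
use surrealdb::{engine::local::Mem, Surreal};

#[derive(Serialize)]
struct Row {
    name: String,
}

#[tokio::main]
async fn main() -> surrealdb::Result<()> {
    let db = Surreal::new::<Mem>(()).await?;
    db.use_ns("test").use_db("test").await?;

    // 1000 records, one statement: SurrealDB unpacks the bound array.
    let rows: Vec<Row> = (0..1000).map(|i| Row { name: format!("row {i}") }).collect();
    db.query("insert into Entity ($rows) RETURN NONE;")
        .bind(("rows", rows))
        .await?;
    Ok(())
}
```
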
+#[derive(Clone, Copy)]
+pub enum CreateVersion {
+    Single,
+    Bulk,
+}
+impl CreateVersion {
+    pub async fn run(
+        self,
+        db: &Surreal<impl Connection>,
+        chunk: &Vec<String>,
+        pb: &Option<ProgressBar>,
+    ) -> bool {
+        match self {
+            CreateVersion::Single => create_db_entities(db, chunk, pb).await.is_ok(),
+            CreateVersion::Bulk => create_db_entities_bulk(db, chunk, pb).await.is_ok(),
+        }
+    }
+}
+
 pub async fn create_db_entities_threaded(
     dbo: Option<Surreal<impl Connection>>, // None::<Surreal<Client>>
     reader: Box<dyn BufRead>,
     pb: Option<ProgressBar>,
     batch_size: usize,
     batch_num: usize,
+    create_version: CreateVersion,
 ) -> Result<(), Error> {
     let mut futures = Vec::new();
     let mut chunk = Vec::new();
     let mut chunk_counter = 0;
+    let mut lines = reader.lines();
+    let mut last_loop = false;
 
-    for line in reader.lines() {
-        chunk.push(line?);
+    loop {
+        let line = lines.next();
+        match line {
+            Some(line) => chunk.push(line?),
+            None => last_loop = true,
+        };
 
-        if chunk.len() >= batch_size {
+        if chunk.len() >= batch_size || last_loop {
             let dbo = dbo.clone();
             let pb = pb.clone();
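The loop implements two levels of batching: lines accumulate into a chunk of `batch_size`, each full chunk is spawned as a task, and after `batch_num` spawned chunks the reader pauses until they all finish, which bounds memory and in-flight connections. The skeleton of that pattern, stripped of the retry and database plumbing (a simplified sketch using plain `tokio::spawn`, not the function itself):

```rust
use futures::future::join_all;

async fn process(chunk: Vec<String>) {
    // Stand-in for create_db_entities / create_db_entities_bulk.
    println!("inserting {} lines", chunk.len());
}

async fn run(lines: impl Iterator<Item = String>, batch_size: usize, batch_num: usize) {
    let mut futures = Vec::new();
    let mut chunk = Vec::new();

    for line in lines {
        chunk.push(line);
        if chunk.len() >= batch_size {
            // Hand the full chunk to a background task.
            futures.push(tokio::spawn(process(std::mem::take(&mut chunk))));
            if futures.len() >= batch_num {
                // Cap the number of in-flight tasks before reading on.
                join_all(futures.drain(..)).await;
            }
        }
    }

    // Flush the final partial chunk, mirroring the `last_loop` handling above.
    futures.push(tokio::spawn(process(chunk)));
    join_all(futures).await;
}
```
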
@@ -119,7 +193,7 @@ pub async fn create_db_entities_threaded(
             loop {
                 match dbo {
                     Some(ref db) => {
-                        if create_db_entities(db, &chunk, &pb).await.is_ok() {
+                        if create_version.run(db, &chunk, &pb).await {
                             break;
                         }
                         if db.use_ns("wikidata").use_db("wikidata").await.is_err() {

@@ -132,7 +206,7 @@ pub async fn create_db_entities_threaded(
                         } else {
                             continue;
                         };
-                        if create_db_entities(&db, &chunk, &pb).await.is_ok() {
+                        if create_version.run(&db, &chunk, &pb).await {
                             break;
                         }
                     }

@@ -142,18 +216,21 @@ pub async fn create_db_entities_threaded(
                     panic!("Failed to create entities, too many retries");
                 }
                 retries += 1;
-                sleep(Duration::from_secs(1)).await;
+                sleep(Duration::from_millis(100)).await;
             }
         }));
         chunk_counter += 1;
         chunk = Vec::new();
     }
 
-    if chunk_counter >= batch_num {
+    if chunk_counter >= batch_num || last_loop {
         join_all(futures).await;
         futures = Vec::new();
         chunk_counter = 0;
     }
+    if last_loop {
+        break;
+    }
 }
 
 match dbo {
@@ -180,3 +257,24 @@ pub async fn create_db_ws() -> Result<Surreal<Client>, Error> {
 
     Ok(db)
 }
+
+pub async fn create_pb() -> ProgressBar {
+    let total_size = 110_000_000;
+    let pb = ProgressBar::new(total_size);
+    pb.set_style(
+        ProgressStyle::with_template(
+            "[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} ETA:[{eta}]",
+        )
+        .unwrap()
+        .with_key(
+            "eta",
+            |state: &ProgressState, w: &mut dyn std::fmt::Write| {
+                let sec = state.eta().as_secs();
+                let min = (sec / 60) % 60;
+                let hr = (sec / 60) / 60;
+                write!(w, "{}:{:02}:{:02}", hr, min, sec % 60).unwrap()
+            },
+        ),
+    );
+    pb
+}

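`create_pb` moves the progress-bar construction out of `main`; the hard-coded `110_000_000` total (down from the earlier inline `113_000_000`) is an estimate of the line count of a current full dump, so the bar's ETA is only as accurate as that guess.
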
tests/data/bench.json (new file, 28 lines)
File diff suppressed because one or more lines are too long

@@ -47,27 +47,29 @@ async fn entity() {
     assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())
 }
 
+async fn entity_threaded_insert(create_version: CreateVersion) -> Result<Surreal<Db>, Error> {
+    let db = inti_db().await?;
+    let reader = File_Format::new("json").reader("tests/data/Entity.json")?;
+
+    create_db_entities_threaded(Some(db.clone()), reader, None, 1000, 100, create_version).await?;
+    Ok(db)
+}
+
 #[tokio::test]
 async fn entity_threaded() {
-    let db = inti_db().await.unwrap();
-    let reader = File_Format::new("json")
-        .reader("tests/data/Entity.json")
-        .unwrap();
-
-    create_db_entities_threaded(Some(db.clone()), reader, None, 1000, 100)
-        .await
-        .unwrap();
+    let db = entity_threaded_insert(CreateVersion::Single).await.unwrap();
     assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())
 }
 
+#[tokio::test]
+async fn entity_threaded_bulk_insert() {
+    let db = entity_threaded_insert(CreateVersion::Bulk).await.unwrap();
+    assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())
+}
+
 async fn property_query(db: &Surreal<Db>) -> Result<Option<f32>, Error> {
     let x: Option<f32> = db
-        .query(
-            r#"
-            return count(select * from Property);
-            "#,
-        )
+        .query("return count(select * from Property);")
         .await
         .unwrap()
         .take(0)

@@ -89,16 +91,24 @@ async fn property() {
     assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())
 }
 
-#[tokio::test]
-async fn property_threaded() {
-    let db = inti_db().await.unwrap();
-    let reader = File_Format::new("json")
-        .reader("tests/data/Property.json")
-        .unwrap();
+async fn property_threaded_insert(create_version: CreateVersion) -> Result<Surreal<Db>, Error> {
+    let db = inti_db().await?;
+    let reader = File_Format::new("json").reader("tests/data/Property.json")?;
 
-    create_db_entities_threaded(Some(db.clone()), reader, None, 1000, 100)
+    create_db_entities_threaded(Some(db.clone()), reader, None, 1000, 100, create_version).await?;
+    Ok(db)
+}
+
+#[tokio::test]
+async fn property_threaded_single_insert() {
+    let db = property_threaded_insert(CreateVersion::Single)
+        .await
+        .unwrap();
+
+    assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())
+}
+
+#[tokio::test]
+async fn property_threaded_bulk_insert() {
+    let db = property_threaded_insert(CreateVersion::Bulk).await.unwrap();
+    assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())
+}