From 38fdee57285d922f7b2cc86d1f366ecb263313a2 Mon Sep 17 00:00:00 2001 From: NexVeridian Date: Tue, 6 Feb 2024 10:05:58 -0800 Subject: [PATCH] match CREATE_MODE --- CONTRIBUTING.md | 3 ++ README.md | 24 ++++++--- benches/bench.rs | 2 +- docker-compose.dev.yml | 2 +- docker-compose.yml | 2 +- src/main.rs | 108 +++++++++++++++++++++++------------------ src/utils.rs | 6 +-- 7 files changed, 87 insertions(+), 60 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 55804f6..c767303 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,6 +22,9 @@ Run tests with `cargo t` Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount` +### View Progress +`docker attach wikidata-to-surrealdb` + # License All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. [Why dual license](https://github.com/bevyengine/bevy/issues/2373). diff --git a/README.md b/README.md index 86fa9f8..87b5ef1 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) da The surrealdb database is ~2.6GB uncompressed or 0.5GB compressed, while the bz2 file is ~80GB, gzip file is ~130GB, and the uncompressed json file is over 1TB. -Querying the entire database takes ~2 seconds per query. Building the database on a 7600k takes ~55 hours, using a cpu with more cores should be faster. +Building the database on a 7600k using ThreadedSingle takes ~55 hours; a CPU with more cores should be faster. 
# Getting The Data https://www.wikidata.org/wiki/Wikidata:Data_access @@ -42,20 +42,32 @@ Create data folder next to docker-compose.yml and .env, place data inside, and s DB_USER=root DB_PASSWORD=root WIKIDATA_LANG=en -FILE_FORMAT=bz2 -FILE_NAME=data/latest-all.json.bz2 +WIKIDATA_FILE_FORMAT=bz2 +WIKIDATA_FILE_NAME=data/latest-all.json.bz2 # If not using docker file for Wikidata to SurrealDB, use 0.0.0.0:8000 WIKIDATA_DB_PORT=surrealdb:8000 -THREADED_REQUESTS=true -WIKIDATA_BULK_INSERT=true # true=overwrite existing data, false=skip if already exists OVERWRITE_DB=false -INDIVIDUAL_WS=true +CREATE_MODE=ThreadedSingle +``` + +Env string CREATE_MODE must be in the enum CreateMode +``` +pub enum CreateMode { + Single, + ThreadedSingle, + ThreadedBulk, // Buggy +} ``` # [Dev Install](./CONTRIBUTING.md#dev-install) # How to Query +``` +namespace = wikidata +database = wikidata +``` + ## See [Useful queries.md](./Useful%20queries.md) # Table Schema diff --git a/benches/bench.rs b/benches/bench.rs index 33fb032..38b488d 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -74,7 +74,7 @@ fn bench(c: &mut Criterion) { criterion_group! 
{ name = benches; - config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Protobuf)).measurement_time(Duration::from_secs(60)); + config = Criterion::default().with_profiler(PProfProfiler::new(120, Output::Protobuf)).measurement_time(Duration::from_secs(50)); targets= bench } criterion_main!(benches); diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 225c815..475a2fd 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -17,7 +17,7 @@ services: deploy: resources: reservations: - cpus: '0.5' + cpus: '1' ports: - 8000:8000 volumes: diff --git a/docker-compose.yml b/docker-compose.yml index 4483e59..911786e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,7 +17,7 @@ services: deploy: resources: reservations: - cpus: '0.5' + cpus: '1' ports: - 8000:8000 volumes: diff --git a/src/main.rs b/src/main.rs index 9f93af1..8f295e3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,14 +11,22 @@ lazy_static! { env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set"); static ref WIKIDATA_FILE_NAME: String = env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set"); - static ref THREADED_REQUESTS: bool = env::var("THREADED_REQUESTS") - .expect("THREADED_REQUESTS not set") - .parse() - .expect("Failed to parse THREADED_REQUESTS"); - static ref WIKIDATA_BULK_INSERT: bool = env::var("WIKIDATA_BULK_INSERT") - .expect("WIKIDATA_BULK_INSERT not set") - .parse() - .expect("Failed to parse WIKIDATA_BULK_INSERT"); + static ref CREATE_MODE: CreateMode = match env::var("CREATE_MODE") + .expect("CREATE_MODE not set") + .as_str() + { + "Single" => CreateMode::Single, + "ThreadedSingle" => CreateMode::ThreadedSingle, + "ThreadedBulk" => CreateMode::ThreadedBulk, + _ => panic!("Unknown CREATE_MODE"), + }; +} + +#[derive(Clone, Copy)] +pub enum CreateMode { + Single, + ThreadedSingle, + ThreadedBulk, } #[tokio::main] @@ -29,51 +37,55 @@ async fn main() -> Result<(), Error> { let db = create_db_ws().await?; let reader = 
File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?; - if !*THREADED_REQUESTS { - let mut counter = 0; - for line in reader.lines() { - let mut retries = 0; - let line = line?; + match *CREATE_MODE { + CreateMode::Single => { + let mut counter = 0; + for line in reader.lines() { + let mut retries = 0; + let line = line?; - loop { - if create_db_entity(&db, &line).await.is_ok() { - break; + loop { + if create_db_entity(&db, &line).await.is_ok() { + break; + } + if retries >= 60 * 10 { + panic!("Failed to create entities, too many retries"); + } + retries += 1; + sleep(Duration::from_secs(1)).await; + if db.use_ns("wikidata").use_db("wikidata").await.is_err() { + continue; + }; } - if retries >= 60 * 10 { - panic!("Failed to create entities, too many retries"); - } - retries += 1; - sleep(Duration::from_secs(1)).await; - if db.use_ns("wikidata").use_db("wikidata").await.is_err() { - continue; - }; - } - counter += 1; - if counter % 100 == 0 { - pb.inc(100); + counter += 1; + if counter % 100 == 0 { + pb.inc(100); + } } } - } else if *WIKIDATA_BULK_INSERT { - create_db_entities_threaded( - None::>, - reader, - Some(pb.clone()), - 2500, - 100, - CreateVersion::Bulk, - ) - .await?; - } else { - create_db_entities_threaded( - None::>, - reader, - Some(pb.clone()), - 2500, - 100, - CreateVersion::Single, - ) - .await?; + CreateMode::ThreadedSingle => { + create_db_entities_threaded( + None::>, + reader, + Some(pb.clone()), + 2_500, + 100, + CreateVersion::Single, + ) + .await?; + } + CreateMode::ThreadedBulk => { + create_db_entities_threaded( + None::>, + reader, + Some(pb.clone()), + 500, + 1000, + CreateVersion::Bulk, + ) + .await?; + } } pb.finish(); diff --git a/src/utils.rs b/src/utils.rs index 2e89453..dee9123 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -80,7 +80,7 @@ pub async fn create_db_entity(db: &Surreal, line: &str) -> Resu pub async fn create_db_entities( db: &Surreal, - lines: &Vec, + lines: &[String], pb: &Option, ) -> Result<(), 
Error> { let mut counter = 0; @@ -168,7 +168,7 @@ impl CreateVersion { pub async fn run( self, db: &Surreal, - chunk: &Vec, + chunk: &[String], pb: &Option, batch_size: usize, ) -> bool { @@ -234,7 +234,7 @@ pub async fn create_db_entities_threaded( panic!("Failed to create entities, too many retries"); } retries += 1; - sleep(Duration::from_millis(100)).await; + sleep(Duration::from_millis(250)).await; } })); chunk_counter += 1;