match CREATE_MODE

Elijah McMorris 2024-02-06 10:05:58 -08:00
parent 305bf5273b
commit 38fdee5728
Signed by: NexVeridian
SSH key fingerprint: SHA256:bsA1SKZxuEcEVHAy3gY1HUeM5ykRJl0U0kQHQn0hMg8
7 changed files with 87 additions and 60 deletions

View file

@@ -22,6 +22,9 @@ Run tests with `cargo t`
 Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount`
+### View Progress
+`docker attach wikidata-to-surrealdb`
 # License
 All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. [Why dual license](https://github.com/bevyengine/bevy/issues/2373).

View file

@@ -3,7 +3,7 @@ A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) da
 The surrealdb database is ~2.6GB uncompressed or 0.5GB compressed, while the bz2 file is ~80GB, gzip file is ~130GB, and the uncompressed json file is over 1TB.
-Querying the entire database takes ~2 seconds per query. Building the database on a 7600k takes ~55 hours, using a cpu with more cores should be faster.
+Querying the entire database takes ~2 seconds per query. Building the database on a 7600k takes ~55 hours using ThreadedSingle; a CPU with more cores should be faster.
 # Getting The Data
 https://www.wikidata.org/wiki/Wikidata:Data_access
@@ -42,20 +42,32 @@ Create data folder next to docker-compose.yml and .env, place data inside, and s
 DB_USER=root
 DB_PASSWORD=root
 WIKIDATA_LANG=en
-FILE_FORMAT=bz2
-FILE_NAME=data/latest-all.json.bz2
+WIKIDATA_FILE_FORMAT=bz2
+WIKIDATA_FILE_NAME=data/latest-all.json.bz2
 # If not using docker file for Wikidata to SurrealDB, use 0.0.0.0:8000
 WIKIDATA_DB_PORT=surrealdb:8000
-THREADED_REQUESTS=true
-WIKIDATA_BULK_INSERT=true
 # true=overwrite existing data, false=skip if already exists
 OVERWRITE_DB=false
-INDIVIDUAL_WS=true
+CREATE_MODE=ThreadedSingle
 ```
+The env string CREATE_MODE must be one of the variants of the enum CreateMode
+```
+pub enum CreateMode {
+    Single,
+    ThreadedSingle,
+    ThreadedBulk, // Buggy
+}
+```
 # [Dev Install](./CONTRIBUTING.md#dev-install)
 # How to Query
+```
+namespace = wikidata
+database = wikidata
+```
 ## See [Useful queries.md](./Useful%20queries.md)
 # Table Schema
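
As an illustration only (not part of this commit): the new "How to Query" section sets both namespace and database to `wikidata`, and the `.env` above supplies the credentials and port. A minimal, hedged Rust sketch of connecting and running a query with the `surrealdb` crate could look like the following; the table name `entity` and the `serde_json` dependency are assumptions for this sketch, not taken from the repository (see the Table Schema section and Useful queries.md for the actual tables).
```
use surrealdb::engine::remote::ws::Ws;
use surrealdb::opt::auth::Root;
use surrealdb::Surreal;

#[tokio::main]
async fn main() -> Result<(), surrealdb::Error> {
    // Address and credentials taken from the .env shown above
    // (use 0.0.0.0:8000 when not running inside the docker-compose setup).
    let db = Surreal::new::<Ws>("0.0.0.0:8000").await?;
    db.signin(Root {
        username: "root",
        password: "root",
    })
    .await?;

    // The README sets namespace = wikidata and database = wikidata.
    db.use_ns("wikidata").use_db("wikidata").await?;

    // "entity" is a placeholder table name for this sketch.
    let mut response = db.query("SELECT count() FROM entity GROUP ALL").await?;
    let count: Vec<serde_json::Value> = response.take(0)?;
    println!("{count:?}");
    Ok(())
}
```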

View file

@@ -74,7 +74,7 @@ fn bench(c: &mut Criterion) {
 criterion_group! {
     name = benches;
-    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Protobuf)).measurement_time(Duration::from_secs(60));
+    config = Criterion::default().with_profiler(PProfProfiler::new(120, Output::Protobuf)).measurement_time(Duration::from_secs(50));
     targets= bench
 }
 criterion_main!(benches);

View file

@@ -17,7 +17,7 @@ services:
     deploy:
       resources:
         reservations:
-          cpus: '0.5'
+          cpus: '1'
     ports:
       - 8000:8000
     volumes:

View file

@@ -17,7 +17,7 @@ services:
     deploy:
       resources:
         reservations:
-          cpus: '0.5'
+          cpus: '1'
     ports:
      - 8000:8000
     volumes:

View file

@@ -11,14 +11,22 @@ lazy_static! {
         env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set");
     static ref WIKIDATA_FILE_NAME: String =
        env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set");
-    static ref THREADED_REQUESTS: bool = env::var("THREADED_REQUESTS")
-        .expect("THREADED_REQUESTS not set")
-        .parse()
-        .expect("Failed to parse THREADED_REQUESTS");
-    static ref WIKIDATA_BULK_INSERT: bool = env::var("WIKIDATA_BULK_INSERT")
-        .expect("WIKIDATA_BULK_INSERT not set")
-        .parse()
-        .expect("Failed to parse WIKIDATA_BULK_INSERT");
+    static ref CREATE_MODE: CreateMode = match env::var("CREATE_MODE")
+        .expect("CREATE_MODE not set")
+        .as_str()
+    {
+        "Single" => CreateMode::Single,
+        "ThreadedSingle" => CreateMode::ThreadedSingle,
+        "ThreadedBulk" => CreateMode::ThreadedBulk,
+        _ => panic!("Unknown CREATE_MODE"),
+    };
+}
+
+#[derive(Clone, Copy)]
+pub enum CreateMode {
+    Single,
+    ThreadedSingle,
+    ThreadedBulk,
 }
 
 #[tokio::main]
@@ -29,7 +37,8 @@ async fn main() -> Result<(), Error> {
     let db = create_db_ws().await?;
     let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;
-    if !*THREADED_REQUESTS {
+    match *CREATE_MODE {
+        CreateMode::Single => {
             let mut counter = 0;
             for line in reader.lines() {
                 let mut retries = 0;
@@ -54,27 +63,30 @@
                 pb.inc(100);
             }
         }
-    } else if *WIKIDATA_BULK_INSERT {
-        create_db_entities_threaded(
-            None::<Surreal<Client>>,
-            reader,
-            Some(pb.clone()),
-            2500,
-            100,
-            CreateVersion::Bulk,
-        )
-        .await?;
-    } else {
-        create_db_entities_threaded(
-            None::<Surreal<Client>>,
-            reader,
-            Some(pb.clone()),
-            2500,
-            100,
-            CreateVersion::Single,
-        )
-        .await?;
-    }
+        }
+        CreateMode::ThreadedSingle => {
+            create_db_entities_threaded(
+                None::<Surreal<Client>>,
+                reader,
+                Some(pb.clone()),
+                2_500,
+                100,
+                CreateVersion::Single,
+            )
+            .await?;
+        }
+        CreateMode::ThreadedBulk => {
+            create_db_entities_threaded(
+                None::<Surreal<Client>>,
+                reader,
+                Some(pb.clone()),
+                500,
+                1000,
+                CreateVersion::Bulk,
+            )
+            .await?;
+        }
+    }
 
     pb.finish();
     Ok(())
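
Aside, not part of this commit: the old THREADED_REQUESTS and WIKIDATA_BULK_INSERT flags were read with `.parse()`, and the same pattern could be kept for the new enum by giving `CreateMode` a `FromStr` impl, so the lazy_static block stays a plain `.parse()` instead of an inline `match`. A hedged sketch of that alternative:
```
use std::str::FromStr;

#[derive(Clone, Copy, Debug)]
pub enum CreateMode {
    Single,
    ThreadedSingle,
    ThreadedBulk,
}

impl FromStr for CreateMode {
    type Err = String;

    // Maps the same three strings the commit matches on in main.rs.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "Single" => Ok(CreateMode::Single),
            "ThreadedSingle" => Ok(CreateMode::ThreadedSingle),
            "ThreadedBulk" => Ok(CreateMode::ThreadedBulk),
            other => Err(format!("Unknown CREATE_MODE: {other}")),
        }
    }
}

fn main() {
    // Mirrors the lazy_static initialization above, but via parse().
    let mode: CreateMode = std::env::var("CREATE_MODE")
        .expect("CREATE_MODE not set")
        .parse()
        .expect("Failed to parse CREATE_MODE");
    println!("{mode:?}");
}
```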

View file

@@ -80,7 +80,7 @@ pub async fn create_db_entity(db: &Surreal<impl Connection>, line: &str) -> Resu
 pub async fn create_db_entities(
     db: &Surreal<impl Connection>,
-    lines: &Vec<String>,
+    lines: &[String],
     pb: &Option<ProgressBar>,
 ) -> Result<(), Error> {
     let mut counter = 0;
@@ -168,7 +168,7 @@ impl CreateVersion {
     pub async fn run(
         self,
         db: &Surreal<impl Connection>,
-        chunk: &Vec<String>,
+        chunk: &[String],
         pb: &Option<ProgressBar>,
         batch_size: usize,
     ) -> bool {
@@ -234,7 +234,7 @@ pub async fn create_db_entities_threaded(
                 panic!("Failed to create entities, too many retries");
             }
             retries += 1;
-            sleep(Duration::from_millis(100)).await;
+            sleep(Duration::from_millis(250)).await;
         }
     }));
     chunk_counter += 1;
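
Aside, not part of this commit: the `&Vec<String>` to `&[String]` parameter changes in this file are a standard Rust API loosening; a `&Vec<String>` deref-coerces to `&[String]`, so existing callers keep working while sub-slices become accepted too. A small standalone illustration (names are hypothetical, not from the repository):
```
// A function taking a slice accepts both &Vec<String> and sub-slices.
fn count_nonempty(lines: &[String]) -> usize {
    lines.iter().filter(|l| !l.is_empty()).count()
}

fn main() {
    let lines: Vec<String> = vec!["Q1".into(), String::new(), "Q42".into()];
    assert_eq!(count_nonempty(&lines), 2);      // &Vec<String> coerces to &[String]
    assert_eq!(count_nonempty(&lines[..2]), 1); // a sub-slice works as well
    println!("ok");
}
```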