mirror of
https://github.com/NexVeridian/wikidata-to-surrealdb.git
synced 2025-09-02 09:59:13 +00:00
match CREATE_MODE
This commit is contained in:
parent
305bf5273b
commit
38fdee5728
7 changed files with 87 additions and 60 deletions
|
@ -22,6 +22,9 @@ Run tests with `cargo t`
|
||||||
|
|
||||||
Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount`
|
Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount`
|
||||||
|
|
||||||
|
### View Progress
|
||||||
|
`docker attach wikidata-to-surrealdb`
|
||||||
|
|
||||||
# License
|
# License
|
||||||
All code in this repository is dual-licensed under either [LICENSE-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. [Why dual license](https://github.com/bevyengine/bevy/issues/2373).
|
All code in this repository is dual-licensed under either [LICENSE-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. [Why dual license](https://github.com/bevyengine/bevy/issues/2373).
|
||||||
|
|
||||||
|
|
24
README.md
24
README.md
|
@ -3,7 +3,7 @@ A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) da
|
||||||
|
|
||||||
The surrealdb database is ~2.6GB uncompressed or 0.5GB compressed, while the bz2 file is ~80GB, gzip file is ~130GB, and the uncompressed json file is over 1TB.
|
The surrealdb database is ~2.6GB uncompressed or 0.5GB compressed, while the bz2 file is ~80GB, gzip file is ~130GB, and the uncompressed json file is over 1TB.
|
||||||
|
|
||||||
Querying the entire database takes ~2 seconds per query. Building the database on a 7600k takes ~55 hours, using a cpu with more cores should be faster.
|
Building the database on a 7600k takes ~55 hours using ThreadedSingle; using a CPU with more cores should be faster.
|
||||||
|
|
||||||
# Getting The Data
|
# Getting The Data
|
||||||
https://www.wikidata.org/wiki/Wikidata:Data_access
|
https://www.wikidata.org/wiki/Wikidata:Data_access
|
||||||
|
@ -42,20 +42,32 @@ Create data folder next to docker-compose.yml and .env, place data inside, and s
|
||||||
DB_USER=root
|
DB_USER=root
|
||||||
DB_PASSWORD=root
|
DB_PASSWORD=root
|
||||||
WIKIDATA_LANG=en
|
WIKIDATA_LANG=en
|
||||||
FILE_FORMAT=bz2
|
WIKIDATA_FILE_FORMAT=bz2
|
||||||
FILE_NAME=data/latest-all.json.bz2
|
WIKIDATA_FILE_NAME=data/latest-all.json.bz2
|
||||||
# If not using docker file for Wikidata to SurrealDB, use 0.0.0.0:8000
|
# If not using docker file for Wikidata to SurrealDB, use 0.0.0.0:8000
|
||||||
WIKIDATA_DB_PORT=surrealdb:8000
|
WIKIDATA_DB_PORT=surrealdb:8000
|
||||||
THREADED_REQUESTS=true
|
|
||||||
WIKIDATA_BULK_INSERT=true
|
|
||||||
# true=overwrite existing data, false=skip if already exists
|
# true=overwrite existing data, false=skip if already exists
|
||||||
OVERWRITE_DB=false
|
OVERWRITE_DB=false
|
||||||
INDIVIDUAL_WS=true
|
CREATE_MODE=ThreadedSingle
|
||||||
|
```
|
||||||
|
|
||||||
|
The env variable CREATE_MODE must be one of the variants of the enum CreateMode
|
||||||
|
```
|
||||||
|
pub enum CreateMode {
|
||||||
|
Single,
|
||||||
|
ThreadedSingle,
|
||||||
|
ThreadedBulk, // Buggy
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
# [Dev Install](./CONTRIBUTING.md#dev-install)
|
# [Dev Install](./CONTRIBUTING.md#dev-install)
|
||||||
|
|
||||||
# How to Query
|
# How to Query
|
||||||
|
```
|
||||||
|
namespace = wikidata
|
||||||
|
database = wikidata
|
||||||
|
```
|
||||||
|
|
||||||
## See [Useful queries.md](./Useful%20queries.md)
|
## See [Useful queries.md](./Useful%20queries.md)
|
||||||
|
|
||||||
# Table Schema
|
# Table Schema
|
||||||
|
|
|
@ -74,7 +74,7 @@ fn bench(c: &mut Criterion) {
|
||||||
|
|
||||||
criterion_group! {
|
criterion_group! {
|
||||||
name = benches;
|
name = benches;
|
||||||
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Protobuf)).measurement_time(Duration::from_secs(60));
|
config = Criterion::default().with_profiler(PProfProfiler::new(120, Output::Protobuf)).measurement_time(Duration::from_secs(50));
|
||||||
targets= bench
|
targets= bench
|
||||||
}
|
}
|
||||||
criterion_main!(benches);
|
criterion_main!(benches);
|
||||||
|
|
|
@ -17,7 +17,7 @@ services:
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
cpus: '0.5'
|
cpus: '1'
|
||||||
ports:
|
ports:
|
||||||
- 8000:8000
|
- 8000:8000
|
||||||
volumes:
|
volumes:
|
||||||
|
|
|
@ -17,7 +17,7 @@ services:
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
cpus: '0.5'
|
cpus: '1'
|
||||||
ports:
|
ports:
|
||||||
- 8000:8000
|
- 8000:8000
|
||||||
volumes:
|
volumes:
|
||||||
|
|
108
src/main.rs
108
src/main.rs
|
@ -11,14 +11,22 @@ lazy_static! {
|
||||||
env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set");
|
env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set");
|
||||||
static ref WIKIDATA_FILE_NAME: String =
|
static ref WIKIDATA_FILE_NAME: String =
|
||||||
env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set");
|
env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set");
|
||||||
static ref THREADED_REQUESTS: bool = env::var("THREADED_REQUESTS")
|
static ref CREATE_MODE: CreateMode = match env::var("CREATE_MODE")
|
||||||
.expect("THREADED_REQUESTS not set")
|
.expect("CREATE_MODE not set")
|
||||||
.parse()
|
.as_str()
|
||||||
.expect("Failed to parse THREADED_REQUESTS");
|
{
|
||||||
static ref WIKIDATA_BULK_INSERT: bool = env::var("WIKIDATA_BULK_INSERT")
|
"Single" => CreateMode::Single,
|
||||||
.expect("WIKIDATA_BULK_INSERT not set")
|
"ThreadedSingle" => CreateMode::ThreadedSingle,
|
||||||
.parse()
|
"ThreadedBulk" => CreateMode::ThreadedBulk,
|
||||||
.expect("Failed to parse WIKIDATA_BULK_INSERT");
|
_ => panic!("Unknown CREATE_MODE"),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub enum CreateMode {
|
||||||
|
Single,
|
||||||
|
ThreadedSingle,
|
||||||
|
ThreadedBulk,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
|
@ -29,51 +37,55 @@ async fn main() -> Result<(), Error> {
|
||||||
let db = create_db_ws().await?;
|
let db = create_db_ws().await?;
|
||||||
let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;
|
let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;
|
||||||
|
|
||||||
if !*THREADED_REQUESTS {
|
match *CREATE_MODE {
|
||||||
let mut counter = 0;
|
CreateMode::Single => {
|
||||||
for line in reader.lines() {
|
let mut counter = 0;
|
||||||
let mut retries = 0;
|
for line in reader.lines() {
|
||||||
let line = line?;
|
let mut retries = 0;
|
||||||
|
let line = line?;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if create_db_entity(&db, &line).await.is_ok() {
|
if create_db_entity(&db, &line).await.is_ok() {
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
if retries >= 60 * 10 {
|
||||||
|
panic!("Failed to create entities, too many retries");
|
||||||
|
}
|
||||||
|
retries += 1;
|
||||||
|
sleep(Duration::from_secs(1)).await;
|
||||||
|
if db.use_ns("wikidata").use_db("wikidata").await.is_err() {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
if retries >= 60 * 10 {
|
|
||||||
panic!("Failed to create entities, too many retries");
|
|
||||||
}
|
|
||||||
retries += 1;
|
|
||||||
sleep(Duration::from_secs(1)).await;
|
|
||||||
if db.use_ns("wikidata").use_db("wikidata").await.is_err() {
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
counter += 1;
|
counter += 1;
|
||||||
if counter % 100 == 0 {
|
if counter % 100 == 0 {
|
||||||
pb.inc(100);
|
pb.inc(100);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if *WIKIDATA_BULK_INSERT {
|
CreateMode::ThreadedSingle => {
|
||||||
create_db_entities_threaded(
|
create_db_entities_threaded(
|
||||||
None::<Surreal<Client>>,
|
None::<Surreal<Client>>,
|
||||||
reader,
|
reader,
|
||||||
Some(pb.clone()),
|
Some(pb.clone()),
|
||||||
2500,
|
2_500,
|
||||||
100,
|
100,
|
||||||
CreateVersion::Bulk,
|
CreateVersion::Single,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
} else {
|
}
|
||||||
create_db_entities_threaded(
|
CreateMode::ThreadedBulk => {
|
||||||
None::<Surreal<Client>>,
|
create_db_entities_threaded(
|
||||||
reader,
|
None::<Surreal<Client>>,
|
||||||
Some(pb.clone()),
|
reader,
|
||||||
2500,
|
Some(pb.clone()),
|
||||||
100,
|
500,
|
||||||
CreateVersion::Single,
|
1000,
|
||||||
)
|
CreateVersion::Bulk,
|
||||||
.await?;
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pb.finish();
|
pb.finish();
|
||||||
|
|
|
@ -80,7 +80,7 @@ pub async fn create_db_entity(db: &Surreal<impl Connection>, line: &str) -> Resu
|
||||||
|
|
||||||
pub async fn create_db_entities(
|
pub async fn create_db_entities(
|
||||||
db: &Surreal<impl Connection>,
|
db: &Surreal<impl Connection>,
|
||||||
lines: &Vec<String>,
|
lines: &[String],
|
||||||
pb: &Option<ProgressBar>,
|
pb: &Option<ProgressBar>,
|
||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
let mut counter = 0;
|
let mut counter = 0;
|
||||||
|
@ -168,7 +168,7 @@ impl CreateVersion {
|
||||||
pub async fn run(
|
pub async fn run(
|
||||||
self,
|
self,
|
||||||
db: &Surreal<impl Connection>,
|
db: &Surreal<impl Connection>,
|
||||||
chunk: &Vec<String>,
|
chunk: &[String],
|
||||||
pb: &Option<ProgressBar>,
|
pb: &Option<ProgressBar>,
|
||||||
batch_size: usize,
|
batch_size: usize,
|
||||||
) -> bool {
|
) -> bool {
|
||||||
|
@ -234,7 +234,7 @@ pub async fn create_db_entities_threaded(
|
||||||
panic!("Failed to create entities, too many retries");
|
panic!("Failed to create entities, too many retries");
|
||||||
}
|
}
|
||||||
retries += 1;
|
retries += 1;
|
||||||
sleep(Duration::from_millis(100)).await;
|
sleep(Duration::from_millis(250)).await;
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
chunk_counter += 1;
|
chunk_counter += 1;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue