mirror of https://github.com/NexVeridian/wikidata-to-surrealdb.git
synced 2025-09-02 09:59:13 +00:00

more error handling and overwrite option

commit 8905c88819
parent e37d413372
8 changed files with 76 additions and 23 deletions

@@ -22,9 +22,7 @@ Run tests with `cargo t`
 Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount`
 
 # License
-All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer.
-
-[Why dual license](https://github.com/bevyengine/bevy/issues/2373)
+All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. [Why dual license](https://github.com/bevyengine/bevy/issues/2373).
 
 # Your contributions
 Any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.

@@ -9,7 +9,7 @@ anyhow = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 surrealdb = "1.0"
-tokio = "1.35"
+tokio = { version = "1.35", features = ["time"] }
 futures = "0.3"
 wikidata = "0.3.1"
 bzip2 = { version = "0.4", features = ["tokio"] }

@@ -16,7 +16,7 @@ https://www.wikidata.org/wiki/Special:EntityData/P527.json
 ```
 
 # Install
-Copy docker-compose.yml
+Copy [docker-compose.yml](./docker-compose.yml)
 
 Create data folder next to docker-compose.yml and .env, place data inside, and set the data type in .env
 ```

@@ -40,6 +40,8 @@ FILE_NAME=data/latest-all.json.bz2
 # If not using docker file for Wikidata to SurrealDB, use 0.0.0.0:8000
 WIKIDATA_DB_PORT=surrealdb:8000
 THREADED_REQUESTS=true
+# true=overwrite existing data, false=skip if already exists
+OVERWRITE_DB=false
 ```
 
 ## View Progress

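The new `OVERWRITE_DB` flag is read once at startup and parsed into a bool (see the `src/utils.rs` hunk further down); a missing or malformed value aborts the run. A minimal standalone sketch of that behaviour, assuming only the standard library — `overwrite_db` is a hypothetical helper for illustration, not part of this commit:

```rust
use std::env;

// Hypothetical helper mirroring the lazy_static block added in src/utils.rs:
// a missing or non-boolean OVERWRITE_DB panics instead of silently defaulting.
fn overwrite_db() -> bool {
    env::var("OVERWRITE_DB")
        .expect("OVERWRITE_DB not set")
        .parse()
        .expect("Failed to parse OVERWRITE_DB")
}

fn main() {
    env::set_var("OVERWRITE_DB", "false");
    assert!(!overwrite_db()); // false = skip records that already exist
}
```
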
@@ -13,6 +13,13 @@ services:
       - --pass
       - $DB_PASSWORD
       - file:/data/surrealdb
+    restart: always
+    deploy:
+      resources:
+        limits:
+          memory: 8GB
+        reservations:
+          cpus: '0.5'
     ports:
       - 8000:8000
     volumes:

@@ -13,6 +13,13 @@ services:
       - --pass
       - $DB_PASSWORD
       - file:/data/surrealdb
+    restart: always
+    deploy:
+      resources:
+        limits:
+          memory: 8GB
+        reservations:
+          cpus: '0.5'
     ports:
       - 8000:8000
     volumes:

src/main.rs (25 changed lines)

@@ -1,8 +1,9 @@
 use anyhow::{Error, Ok, Result};
 use indicatif::{ProgressBar, ProgressState, ProgressStyle};
 use lazy_static::lazy_static;
-use std::{env, fmt::Write, io::BufRead, thread, time::Duration};
+use std::{env, fmt::Write, io::BufRead};
 use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal};
+use tokio::time::{sleep, Duration};
 mod utils;
 use utils::*;
 
@@ -18,7 +19,7 @@ lazy_static! {
 
 #[tokio::main]
 async fn main() -> Result<(), Error> {
-    thread::sleep(Duration::from_secs(10));
+    sleep(Duration::from_secs(10)).await;
     let total_size = 113_000_000;
 
     let pb = ProgressBar::new(total_size);

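Replacing `std::thread::sleep` with `tokio::time::sleep` (backed by the new `time` feature in Cargo.toml) means the 10-second wait for SurrealDB no longer blocks a runtime worker thread. A minimal sketch of the non-blocking wait, assuming a Tokio build with the `macros`, `rt-multi-thread`, and `time` features enabled:

```rust
use tokio::time::{sleep, Duration};

#[tokio::main]
async fn main() {
    // Unlike std::thread::sleep, this yields to the Tokio runtime,
    // so other tasks keep making progress while we wait for the database.
    sleep(Duration::from_secs(10)).await;
    println!("SurrealDB should be reachable now");
}
```
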
@@ -48,14 +49,30 @@ async fn main() -> Result<(), Error> {
     if !*THREADED_REQUESTS {
         let mut counter = 0;
         for line in reader.lines() {
-            create_db_entity(&db, line?).await?;
+            let mut retries = 0;
+            let line = line?;
+
+            loop {
+                if create_db_entity(&db, &line).await.is_ok() {
+                    break;
+                }
+                if retries >= 60 * 10 {
+                    panic!("Failed to create entities, too many retries");
+                }
+                retries += 1;
+                sleep(Duration::from_secs(1)).await;
+                if db.use_ns("wikidata").use_db("wikidata").await.is_err() {
+                    continue;
+                };
+            }
+
             counter += 1;
             if counter % 100 == 0 {
                 pb.inc(100);
             }
         }
     } else {
-        create_db_entities_threaded(&db, reader, Some(pb.clone()), 1000, 100).await?;
+        create_db_entities_threaded(&db, reader, Some(pb.clone()), 2500, 100).await?;
     }
 
     pb.finish();

src/utils.rs (45 changed lines)

@@ -2,17 +2,27 @@ use anyhow::{Error, Ok, Result};
 use bzip2::read::MultiBzDecoder;
 use futures::future::join_all;
 use indicatif::ProgressBar;
+use lazy_static::lazy_static;
 use serde_json::{from_str, Value};
 use std::{
+    env,
     fs::File,
     io::{BufRead, BufReader},
 };
 use surrealdb::{Connection, Surreal};
+use tokio::time::{sleep, Duration};
 use wikidata::Entity;
 
 mod tables;
 use tables::*;
 
+lazy_static! {
+    static ref OVERWRITE_DB: bool = env::var("OVERWRITE_DB")
+        .expect("OVERWRITE_DB not set")
+        .parse()
+        .expect("Failed to parse OVERWRITE_DB");
+}
+
 #[allow(non_camel_case_types)]
 pub enum File_Format {
     json,

@@ -35,7 +45,7 @@ impl File_Format {
     }
 }
 
-pub async fn create_db_entity<C: Connection>(db: &Surreal<C>, line: String) -> Result<(), Error> {
+pub async fn create_db_entity<C: Connection>(db: &Surreal<C>, line: &str) -> Result<(), Error> {
     let line = line.trim().trim_end_matches(',').to_string();
     if line == "[" || line == "]" {
         return Ok(());

@@ -48,15 +58,13 @@ pub async fn create_db_entity<C: Connection>(db: &Surreal<C>, line: String) -> Result<(), Error> {
 
     let id = data.id.clone().expect("No ID");
     data.id = None;
-    let _ = db.create::<Option<EntityMini>>(&id).await.is_err();
-    {
+    if db.create::<Option<EntityMini>>(&id).await.is_err() && *OVERWRITE_DB {
         db.update::<Option<EntityMini>>(&id).content(data).await?;
-    };
+    }
 
     let id = claims.id.clone().expect("No ID");
     claims.id = None;
-    let _ = db.create::<Option<Claims>>(&id).await.is_err();
-    {
+    if db.create::<Option<Claims>>(&id).await.is_err() && *OVERWRITE_DB {
         db.update::<Option<Claims>>(&id).content(claims).await?;
     }
     Ok(())

@@ -64,12 +72,12 @@ pub async fn create_db_entity<C: Connection>(db: &Surreal<C>, line: String) -> Result<(), Error> {
 
 pub async fn create_db_entities<C: Connection>(
     db: &Surreal<C>,
-    lines: Vec<String>,
-    pb: Option<ProgressBar>,
+    lines: &Vec<String>,
+    pb: &Option<ProgressBar>,
 ) -> Result<(), Error> {
     let mut counter = 0;
     for line in lines {
-        create_db_entity(db, line.to_string()).await?;
+        create_db_entity(db, line).await?;
         counter += 1;
         if counter % 100 == 0 {
             if let Some(ref p) = pb {

@@ -92,7 +100,7 @@ pub async fn create_db_entities_threaded<C: Connection>(
     let mut chunk_counter = 0;
 
     for line in reader.lines() {
        chunk.push(line.unwrap());
-        chunk.push(line.unwrap());
+        chunk.push(line?);
 
         if chunk.len() >= batch_size {
            let db = db.clone();
             let db = db.clone();

@@ -100,7 +108,20 @@
             let pb = pb.clone();
 
             futures.push(tokio::spawn(async move {
-                create_db_entities(&db, lines, pb).await.unwrap();
+                let mut retries = 0;
+                loop {
+                    if create_db_entities(&db, &lines, &pb).await.is_ok() {
+                        break;
+                    }
+                    if retries >= 60 * 10 {
+                        panic!("Failed to create entities, too many retries");
+                    }
+                    retries += 1;
+                    sleep(Duration::from_secs(1)).await;
+                    if db.use_ns("wikidata").use_db("wikidata").await.is_err() {
+                        continue;
+                    };
+                }
             }));
             chunk_counter += 1;
             chunk.clear();

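This is the same retry loop added to the single-threaded path in src/main.rs: retry once per second, re-select the namespace and database after a failure, and panic after 600 attempts. A minimal sketch of how the retry/sleep/give-up part could be factored into a shared helper — `retry_every_second` and its closure-based signature are assumptions for illustration, not part of this commit (the namespace re-selection would still be the caller's job):

```rust
use std::future::Future;
use tokio::time::{sleep, Duration};

/// Hypothetical helper: run `op` until it succeeds, sleeping one second
/// between attempts and giving up after `max_retries` failures.
async fn retry_every_second<F, Fut, T, E>(mut op: F, max_retries: u32) -> Option<T>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut retries = 0;
    loop {
        if let Ok(value) = op().await {
            return Some(value);
        }
        if retries >= max_retries {
            return None; // caller decides whether to panic, as the commit does
        }
        retries += 1;
        sleep(Duration::from_secs(1)).await;
    }
}

#[tokio::main]
async fn main() {
    // Dummy operation standing in for create_db_entities; 600 = 60 * 10,
    // matching the commit's retry budget.
    let result = retry_every_second(|| async { Ok::<_, ()>(42) }, 600).await;
    assert_eq!(result, Some(42));
}
```
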
@@ -113,7 +134,7 @@ pub async fn create_db_entities_threaded<C: Connection>(
         }
     }
 
-    create_db_entities(db, chunk, pb).await.unwrap();
+    create_db_entities(db, &chunk, &pb).await?;
     join_all(futures).await;
     Ok(())
 }

@@ -9,6 +9,7 @@ use wikidata_to_surrealdb::utils::*;
 
 async fn inti_db() -> Result<Surreal<Db>, Error> {
     env::set_var("WIKIDATA_LANG", "en");
+    env::set_var("OVERWRITE_DB", "true");
 
     let db = Surreal::new::<Mem>(()).await?;
     db.use_ns("wikidata").use_db("wikidata").await?;

@@ -40,7 +41,7 @@ async fn entity() {
     .unwrap();
 
     for line in reader.lines() {
-        create_db_entity(&db, line.unwrap()).await.unwrap();
+        create_db_entity(&db, &line.unwrap()).await.unwrap();
     }
 
     assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())

@@ -82,7 +83,7 @@ async fn property() {
     .unwrap();
 
     for line in reader.lines() {
-        create_db_entity(&db, line.unwrap()).await.unwrap();
+        create_db_entity(&db, &line.unwrap()).await.unwrap();
     }
 
     assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())