mirror of
https://github.com/NexVeridian/wikidata-to-surrealdb.git
synced 2025-09-02 01:49:13 +00:00
refactor init_db and init_progress_bar
This commit is contained in:
parent
731df97cd2
commit
2ded1d5b1b
8 changed files with 103 additions and 134 deletions
87
Cargo.lock
generated
87
Cargo.lock
generated
|
@ -434,9 +434,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.17.0"
|
||||
version = "1.17.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6fd4c6dcc3b0aea2f5c0b4b82c2b15fe39ddbc76041a310848f4706edf76bb31"
|
||||
checksum = "773d90827bc3feecfb67fab12e24de0749aad83c74b9504ecde46237b5cd24e2"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
|
@ -500,9 +500,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.1.14"
|
||||
version = "1.1.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50d2eb3cd3d1bf4529e31c215ee6f93ec5a3d536d9f578f93d9d33ee19562932"
|
||||
checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
|
||||
dependencies = [
|
||||
"shlex",
|
||||
]
|
||||
|
@ -2385,9 +2385,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
|||
|
||||
[[package]]
|
||||
name = "proc-macro-crate"
|
||||
version = "3.1.0"
|
||||
version = "3.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284"
|
||||
checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b"
|
||||
dependencies = [
|
||||
"toml_edit",
|
||||
]
|
||||
|
@ -2994,9 +2994,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.34"
|
||||
version = "0.38.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
|
||||
checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f"
|
||||
dependencies = [
|
||||
"bitflags 2.6.0",
|
||||
"errno",
|
||||
|
@ -3041,9 +3041,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.102.6"
|
||||
version = "0.102.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e"
|
||||
checksum = "84678086bd54edf2b415183ed7a94d0efb049f1b646a33e22a36f3794be6ae56"
|
||||
dependencies = [
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
|
@ -3346,15 +3346,15 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
|||
|
||||
[[package]]
|
||||
name = "stacker"
|
||||
version = "0.1.16"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95a5daa25ea337c85ed954c0496e3bdd2c7308cc3b24cf7b50d04876654c579f"
|
||||
checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"psm",
|
||||
"windows-sys 0.36.1",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -3831,9 +3831,9 @@ checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
|
|||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.21.1"
|
||||
version = "0.22.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1"
|
||||
checksum = "583c44c02ad26b0c3f3066fe629275e50627026c51ac2e595cca4c230ce1ce1d"
|
||||
dependencies = [
|
||||
"indexmap 2.4.0",
|
||||
"toml_datetime",
|
||||
|
@ -4307,19 +4307,6 @@ dependencies = [
|
|||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2"
|
||||
dependencies = [
|
||||
"windows_aarch64_msvc 0.36.1",
|
||||
"windows_i686_gnu 0.36.1",
|
||||
"windows_i686_msvc 0.36.1",
|
||||
"windows_x86_64_gnu 0.36.1",
|
||||
"windows_x86_64_msvc 0.36.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.52.0"
|
||||
|
@ -4345,13 +4332,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc 0.52.6",
|
||||
"windows_i686_gnu 0.52.6",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_msvc 0.52.6",
|
||||
"windows_x86_64_gnu 0.52.6",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc 0.52.6",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -4360,24 +4347,12 @@ version = "0.52.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
|
@ -4390,24 +4365,12 @@ version = "0.52.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
|
@ -4420,12 +4383,6 @@ version = "0.52.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
|
@ -4434,9 +4391,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
|||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.5.40"
|
||||
version = "0.6.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876"
|
||||
checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
|
|
@ -11,7 +11,7 @@ async fn inti_db() -> Result<Surreal<Db>, Error> {
|
|||
env::set_var("WIKIDATA_LANG", "en");
|
||||
env::set_var("OVERWRITE_DB", "true");
|
||||
|
||||
let db = create_db_mem().await?;
|
||||
let db = init_db::create_db_mem().await?;
|
||||
|
||||
Ok(db)
|
||||
}
|
||||
|
|
18
flake.lock
generated
18
flake.lock
generated
|
@ -3,11 +3,11 @@
|
|||
"advisory-db": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1724510776,
|
||||
"narHash": "sha256-K9CHOXzHPfNjZsz3dC9Vhdryz70dyaDTsCjFJHB19xA=",
|
||||
"lastModified": 1724775741,
|
||||
"narHash": "sha256-xuj7Ye3Y2EgunLiEEV5zYxUQuLTURV5mgbXDB1fA7h8=",
|
||||
"owner": "rustsec",
|
||||
"repo": "advisory-db",
|
||||
"rev": "dd0703e582ab7edc2637bc3385d540c3dbffa0db",
|
||||
"rev": "fe4d5979b34444815287d61bd2a4e193cebbc7a6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -44,11 +44,11 @@
|
|||
"rust-analyzer-src": []
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1724653830,
|
||||
"narHash": "sha256-88f0KK8h6tGIP4Na5RJDKs0S+7WsGGaCGNkLj/bPV3g=",
|
||||
"lastModified": 1724740262,
|
||||
"narHash": "sha256-cpFasbzOTlwLi4fNas6hDznVUdCJn/lMLxi7MAMG6hg=",
|
||||
"owner": "nix-community",
|
||||
"repo": "fenix",
|
||||
"rev": "9ecf5e7d800ace001320da8acadd4a3deb872a83",
|
||||
"rev": "703efdd9b5c6a7d5824afa348a24fbbf8ff226be",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -77,11 +77,11 @@
|
|||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1724395761,
|
||||
"narHash": "sha256-zRkDV/nbrnp3Y8oCADf5ETl1sDrdmAW6/bBVJ8EbIdQ=",
|
||||
"lastModified": 1724748588,
|
||||
"narHash": "sha256-NlpGA4+AIf1dKNq76ps90rxowlFXUsV9x7vK/mN37JM=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "ae815cee91b417be55d43781eb4b73ae1ecc396c",
|
||||
"rev": "a6292e34000dc93d43bccf78338770c1c5ec8a99",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
|
@ -3,6 +3,7 @@ use lazy_static::lazy_static;
|
|||
use std::{env, io::BufRead};
|
||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
||||
use tokio::time::{sleep, Duration};
|
||||
|
||||
mod utils;
|
||||
use utils::*;
|
||||
|
||||
|
@ -35,9 +36,9 @@ pub enum CreateMode {
|
|||
#[tokio::main]
|
||||
async fn main() -> Result<(), Error> {
|
||||
sleep(Duration::from_secs(10)).await;
|
||||
let pb = create_pb().await;
|
||||
let pb = init_progress_bar::create_pb().await;
|
||||
|
||||
let db = create_db_ws().await?;
|
||||
let db = init_db::create_db_ws().await?;
|
||||
let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;
|
||||
|
||||
match *CREATE_MODE {
|
||||
|
|
63
src/utils.rs
63
src/utils.rs
|
@ -2,7 +2,7 @@ use anyhow::{Error, Result};
|
|||
use bzip2::read::MultiBzDecoder;
|
||||
use core::panic;
|
||||
use futures::future::join_all;
|
||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||
use indicatif::ProgressBar;
|
||||
use lazy_static::lazy_static;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde_json::{from_str, Value};
|
||||
|
@ -11,17 +11,12 @@ use std::{
|
|||
fs::File,
|
||||
io::{BufRead, BufReader},
|
||||
};
|
||||
use surrealdb::{
|
||||
engine::{
|
||||
local::{Db, Mem},
|
||||
remote::ws::{Client, Ws},
|
||||
},
|
||||
opt::auth::Root,
|
||||
Connection, Surreal,
|
||||
};
|
||||
use surrealdb::{Connection, Surreal};
|
||||
use tokio::time::{sleep, Duration};
|
||||
use wikidata::Entity;
|
||||
|
||||
pub mod init_db;
|
||||
pub mod init_progress_bar;
|
||||
mod tables;
|
||||
use tables::*;
|
||||
|
||||
|
@ -30,10 +25,6 @@ lazy_static! {
|
|||
.expect("OVERWRITE_DB not set")
|
||||
.parse()
|
||||
.expect("Failed to parse OVERWRITE_DB");
|
||||
static ref DB_USER: String = env::var("DB_USER").expect("DB_USER not set");
|
||||
static ref DB_PASSWORD: String = env::var("DB_PASSWORD").expect("DB_PASSWORD not set");
|
||||
static ref WIKIDATA_DB_PORT: String =
|
||||
env::var("WIKIDATA_DB_PORT").expect("WIKIDATA_DB_PORT not set");
|
||||
static ref FILTER_PATH: String =
|
||||
env::var("FILTER_PATH").unwrap_or("../filter.surql".to_string());
|
||||
}
|
||||
|
@ -172,7 +163,7 @@ pub async fn create_db_entities_bulk_filter(
|
|||
pb: &Option<ProgressBar>,
|
||||
batch_size: usize,
|
||||
) -> Result<(), Error> {
|
||||
let db_mem = create_db_mem().await?;
|
||||
let db_mem = init_db::create_db_mem().await?;
|
||||
create_db_entities_bulk(&db_mem, lines, &None, batch_size).await?;
|
||||
|
||||
let filter = tokio::fs::read_to_string(&*FILTER_PATH).await?;
|
||||
|
@ -263,7 +254,7 @@ pub async fn create_db_entities_threaded(
|
|||
};
|
||||
}
|
||||
None => {
|
||||
let db = if let Ok(db) = create_db_ws().await {
|
||||
let db = if let Ok(db) = init_db::create_db_ws().await {
|
||||
db
|
||||
} else {
|
||||
continue;
|
||||
|
@ -300,49 +291,9 @@ pub async fn create_db_entities_threaded(
|
|||
create_db_entities(&db, &chunk, &pb).await?;
|
||||
}
|
||||
None => {
|
||||
create_db_entities(&create_db_ws().await?, &chunk, &pb).await?;
|
||||
create_db_entities(&init_db::create_db_ws().await?, &chunk, &pb).await?;
|
||||
}
|
||||
}
|
||||
join_all(futures).await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn create_db_ws() -> Result<Surreal<Client>, Error> {
|
||||
let db = Surreal::new::<Ws>(WIKIDATA_DB_PORT.as_str()).await?;
|
||||
|
||||
db.signin(Root {
|
||||
username: &DB_USER,
|
||||
password: &DB_PASSWORD,
|
||||
})
|
||||
.await?;
|
||||
db.use_ns("wikidata").use_db("wikidata").await?;
|
||||
|
||||
Ok(db)
|
||||
}
|
||||
|
||||
pub async fn create_db_mem() -> Result<Surreal<Db>, Error> {
|
||||
let db = Surreal::new::<Mem>(()).await?;
|
||||
db.use_ns("wikidata").use_db("wikidata").await?;
|
||||
Ok(db)
|
||||
}
|
||||
|
||||
pub async fn create_pb() -> ProgressBar {
|
||||
let total_size = 110_000_000;
|
||||
let pb = ProgressBar::new(total_size);
|
||||
pb.set_style(
|
||||
ProgressStyle::with_template(
|
||||
"[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} ETA:[{eta}]",
|
||||
)
|
||||
.unwrap()
|
||||
.with_key(
|
||||
"eta",
|
||||
|state: &ProgressState, w: &mut dyn std::fmt::Write| {
|
||||
let sec = state.eta().as_secs();
|
||||
let min = (sec / 60) % 60;
|
||||
let hr = (sec / 60) / 60;
|
||||
write!(w, "{}:{:02}:{:02}", hr, min, sec % 60).unwrap()
|
||||
},
|
||||
),
|
||||
);
|
||||
pb
|
||||
}
|
||||
|
|
38
src/utils/init_db.rs
Normal file
38
src/utils/init_db.rs
Normal file
|
@ -0,0 +1,38 @@
|
|||
use anyhow::Error;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use std::env;
|
||||
use surrealdb::{
|
||||
engine::{
|
||||
local::{Db, Mem},
|
||||
remote::ws::{Client, Ws},
|
||||
},
|
||||
opt::auth::Root,
|
||||
Surreal,
|
||||
};
|
||||
|
||||
lazy_static! {
|
||||
static ref DB_USER: String = env::var("DB_USER").expect("DB_USER not set");
|
||||
static ref DB_PASSWORD: String = env::var("DB_PASSWORD").expect("DB_PASSWORD not set");
|
||||
static ref WIKIDATA_DB_PORT: String =
|
||||
env::var("WIKIDATA_DB_PORT").expect("WIKIDATA_DB_PORT not set");
|
||||
}
|
||||
|
||||
pub async fn create_db_ws() -> Result<Surreal<Client>, Error> {
|
||||
let db = Surreal::new::<Ws>(WIKIDATA_DB_PORT.as_str()).await?;
|
||||
|
||||
db.signin(Root {
|
||||
username: &DB_USER,
|
||||
password: &DB_PASSWORD,
|
||||
})
|
||||
.await?;
|
||||
db.use_ns("wikidata").use_db("wikidata").await?;
|
||||
|
||||
Ok(db)
|
||||
}
|
||||
|
||||
pub async fn create_db_mem() -> Result<Surreal<Db>, Error> {
|
||||
let db = Surreal::new::<Mem>(()).await?;
|
||||
db.use_ns("wikidata").use_db("wikidata").await?;
|
||||
Ok(db)
|
||||
}
|
22
src/utils/init_progress_bar.rs
Normal file
22
src/utils/init_progress_bar.rs
Normal file
|
@ -0,0 +1,22 @@
|
|||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||
|
||||
pub async fn create_pb() -> ProgressBar {
|
||||
let total_size = 110_000_000;
|
||||
let pb = ProgressBar::new(total_size);
|
||||
pb.set_style(
|
||||
ProgressStyle::with_template(
|
||||
"[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} ETA:[{eta}]",
|
||||
)
|
||||
.unwrap()
|
||||
.with_key(
|
||||
"eta",
|
||||
|state: &ProgressState, w: &mut dyn std::fmt::Write| {
|
||||
let sec = state.eta().as_secs();
|
||||
let min = (sec / 60) % 60;
|
||||
let hr = (sec / 60) / 60;
|
||||
write!(w, "{}:{:02}:{:02}", hr, min, sec % 60).unwrap()
|
||||
},
|
||||
),
|
||||
);
|
||||
pb
|
||||
}
|
|
@ -9,7 +9,7 @@ async fn inti_db() -> Result<Surreal<Db>, Error> {
|
|||
env::set_var("WIKIDATA_LANG", "en");
|
||||
env::set_var("OVERWRITE_DB", "true");
|
||||
|
||||
let db = create_db_mem().await?;
|
||||
let db = init_db::create_db_mem().await?;
|
||||
|
||||
Ok(db)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue