From ee15b46ae27863e03ea887454b8b5fbc86f63af4 Mon Sep 17 00:00:00 2001 From: NexVeridian Date: Sat, 16 Dec 2023 00:35:55 -0800 Subject: [PATCH] progress bar, fix docker --- .dockerignore | 6 ++++-- Cargo.toml | 1 + DockerFile | 4 ++-- README.md | 29 ++++++++++++++++++++++++++--- docker-compose.dev.yml | 7 +------ docker-compose.yml | 7 +------ src/main.rs | 38 ++++++++++++++++++++++++++++++++++---- src/utils.rs | 4 ++-- 8 files changed, 71 insertions(+), 25 deletions(-) diff --git a/.dockerignore b/.dockerignore index 9346958..92dbecb 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,8 +4,10 @@ /data /target .dockerignore +.env +.gitignore Cargo.lock -Dockerfile -docker-compose.yml docker-compose.dev.yml +docker-compose.yml +dockerfile *.md diff --git a/Cargo.toml b/Cargo.toml index 8eef51d..49536bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,4 @@ tokio = "1.35" wikidata = "0.3.1" bzip2 = { version = "0.4", features = ["tokio"] } lazy_static = "1.4" +indicatif = "0.17" diff --git a/DockerFile b/DockerFile index 361fe1c..a67c181 100644 --- a/DockerFile +++ b/DockerFile @@ -20,8 +20,8 @@ RUN --mount=type=cache,target=/usr/local/cargo,from=rust,source=/usr/local/cargo FROM alpine:latest AS main -WORKDIR /wikidata-to-surrealdb +# WORKDIR /wikidata-to-surrealdb COPY --from=builder wikidata-to-surrealdb/wikidata-to-surrealdb . -CMD ["./wikidata-to-surrealdb"] \ No newline at end of file +CMD ["./wikidata-to-surrealdb"] diff --git a/README.md b/README.md index f7b869e..d55973c 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,51 @@ +# Wikidata to SurrealDB A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database. Either From a bz2 or json file format. # Getting The Data https://www.wikidata.org/wiki/Wikidata:Data_access -## From bz2 file (Recommended) ~80GB +## From bz2 file ~80GB ### Dump: [Docs](https://www.wikidata.org/wiki/Wikidata:Database_download) ### [Download - latest-all.json.bz2](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) -## From json file +## From json file ### Linked Data Interface: [Docs](https://www.wikidata.org/wiki/Wikidata:Data_access#Linked_Data_Interface_(URI)) ``` https://www.wikidata.org/wiki/Special:EntityData/Q60746544.json https://www.wikidata.org/wiki/Special:EntityData/P527.json ``` -# Example .env +# Install +Copy docker-compose.yml + +Create data folder next to docker-compose.yml and .env, place data inside, and set the data type in .env +``` +├── data +│ ├── Entity.json +│ ├── latest-all.json.bz2 +│ └── surrealdb +├── docker-compose.yml +└── .env +``` + +`docker compose up --pull always` + +## Example .env ``` DB_USER=root DB_PASSWORD=root WIKIDATA_LANG=en FILE_FORMAT=bz2 FILE_NAME=data/latest-all.json.bz2 +# If not using docker file for Wikidata to SurrealDB, use 0.0.0.0:8000 +WIKIDATA_DB_PORT=surrealdb:8000 ``` +## View Progress +`docker attach wikidata-to-surrealdb` + +# [Dev Install](./CONTRIBUTING.md#dev-install) + # How to Query ## See [Useful queries.md](./Useful%20queries.md) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 8deae62..534ad9f 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -26,13 +26,8 @@ services: context: . env_file: - .env - environment: - - DB_USER=$DB_USER - - DB_PASSWORD=$DB_PASSWORD - - WIKIDATA_LANG=$WIKIDATA_LANG - - FILE_FORMAT=$FILE_FORMAT - - FILE_NAME=$FILE_NAME restart: no + tty: true depends_on: - surrealdb volumes: diff --git a/docker-compose.yml b/docker-compose.yml index b31b55b..b955f49 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -25,13 +25,8 @@ services: image: ghcr.io/nexveridian/ark-invest-api-rust-data:latest env_file: - .env - environment: - - DB_USER=$DB_USER - - DB_PASSWORD=$DB_PASSWORD - - WIKIDATA_LANG=$WIKIDATA_LANG - - FILE_FORMAT=$FILE_FORMAT - - FILE_NAME=$FILE_NAME restart: no + tty: true depends_on: - surrealdb volumes: diff --git a/src/main.rs b/src/main.rs index 0393670..6d17533 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,15 @@ use anyhow::{Error, Ok, Result}; use bzip2::read::MultiBzDecoder; +use indicatif::{ProgressBar, ProgressState, ProgressStyle}; use lazy_static::lazy_static; use serde_json::{from_str, Value}; use std::{ env, + fmt::Write, fs::File, io::{BufRead, BufReader}, + thread, + time::Duration, }; use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal}; use wikidata::Entity; @@ -17,8 +21,9 @@ lazy_static! { #[derive(Debug)] static ref DB_USER: String = env::var("DB_USER").expect("DB_USER not set"); static ref DB_PASSWORD: String = env::var("DB_PASSWORD").expect("DB_PASSWORD not set"); - static ref FILE_FORMAT: String = env::var("FILE_FORMAT").expect("FILE_FORMAT not set"); - static ref FILE_NAME: String = env::var("FILE_NAME").expect("FILE_NAME not set"); + static ref WIKIDATA_FILE_FORMAT: String = env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set"); + static ref WIKIDATA_FILE_NAME: String = env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set"); + static ref WIKIDATA_DB_PORT: String = env::var("WIKIDATA_DB_PORT").expect("WIKIDATA_DB_PORT not set"); } #[allow(non_camel_case_types)] @@ -45,7 +50,26 @@ impl File_Format { #[tokio::main] async fn main() -> Result<(), Error> { - let db = Surreal::new::("0.0.0.0:8000").await?; + thread::sleep(Duration::from_secs(10)); + + let mut compleated = 0; + let total_size = 113_000_000; + + let pb = ProgressBar::new(total_size); + pb.set_style( + ProgressStyle::with_template( + "[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} {percent} ETA:{eta}", + )? + .with_key("eta", |state: &ProgressState, w: &mut dyn Write| { + let sec = state.eta().as_secs(); + let min = (sec / 60) % 60; + let hr = (sec / 60) / 60; + write!(w, "{}:{:02}:{:02}", hr, min, sec % 60).unwrap() + }), + ); + + let db = Surreal::new::(WIKIDATA_DB_PORT.as_str()).await?; + db.signin(Root { username: &DB_USER, password: &DB_PASSWORD, @@ -53,7 +77,7 @@ async fn main() -> Result<(), Error> { .await?; db.use_ns("wikidata").use_db("wikidata").await?; - let reader = File_Format::new(&FILE_FORMAT).reader(&FILE_NAME)?; + let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?; for line in reader.lines() { let line = line?.trim().trim_end_matches(',').to_string(); @@ -75,7 +99,13 @@ async fn main() -> Result<(), Error> { claims.id = None; let _: Option = db.delete(&id).await?; let _: Option = db.create(&id).content(claims).await?; + + compleated += 1; + if compleated % 1000 == 0 { + pb.set_position(compleated); + } } + pb.finish_with_message("Done parsing Wikidata"); Ok(()) } diff --git a/src/utils.rs b/src/utils.rs index 87b8853..bf3155f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -125,8 +125,8 @@ fn get_name(entity: &Entity) -> String { entity .labels .get(&Lang(WIKIDATA_LANG.to_string())) - .expect("No label found") - .to_string() + .map(|label| label.to_string()) + .unwrap_or_default() } fn get_description(entity: &Entity) -> Option {