progress bar, fix docker

This commit is contained in:
Elijah McMorris 2023-12-16 00:35:55 -08:00
parent 8b3311ff21
commit ee15b46ae2
Signed by: NexVeridian
SSH key fingerprint: SHA256:bsA1SKZxuEcEVHAy3gY1HUeM5ykRJl0U0kQHQn0hMg8
8 changed files with 71 additions and 25 deletions

View file

@ -4,8 +4,10 @@
/data /data
/target /target
.dockerignore .dockerignore
.env
.gitignore
Cargo.lock Cargo.lock
Dockerfile
docker-compose.yml
docker-compose.dev.yml docker-compose.dev.yml
docker-compose.yml
dockerfile
*.md *.md

View file

@ -13,3 +13,4 @@ tokio = "1.35"
wikidata = "0.3.1" wikidata = "0.3.1"
bzip2 = { version = "0.4", features = ["tokio"] } bzip2 = { version = "0.4", features = ["tokio"] }
lazy_static = "1.4" lazy_static = "1.4"
indicatif = "0.17"

View file

@ -20,8 +20,8 @@ RUN --mount=type=cache,target=/usr/local/cargo,from=rust,source=/usr/local/cargo
FROM alpine:latest AS main FROM alpine:latest AS main
WORKDIR /wikidata-to-surrealdb # WORKDIR /wikidata-to-surrealdb
COPY --from=builder wikidata-to-surrealdb/wikidata-to-surrealdb . COPY --from=builder wikidata-to-surrealdb/wikidata-to-surrealdb .
CMD ["./wikidata-to-surrealdb"] CMD ["./wikidata-to-surrealdb"]

View file

@ -1,28 +1,51 @@
# Wikidata to SurrealDB
A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database, from either a bz2 or a json file. A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database, from either a bz2 or a json file.
# Getting The Data # Getting The Data
https://www.wikidata.org/wiki/Wikidata:Data_access https://www.wikidata.org/wiki/Wikidata:Data_access
## From bz2 file (Recommended) ~80GB ## From bz2 file ~80GB
### Dump: [Docs](https://www.wikidata.org/wiki/Wikidata:Database_download) ### Dump: [Docs](https://www.wikidata.org/wiki/Wikidata:Database_download)
### [Download - latest-all.json.bz2](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) ### [Download - latest-all.json.bz2](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
## From json file ## From json file
### Linked Data Interface: [Docs](https://www.wikidata.org/wiki/Wikidata:Data_access#Linked_Data_Interface_(URI)) ### Linked Data Interface: [Docs](https://www.wikidata.org/wiki/Wikidata:Data_access#Linked_Data_Interface_(URI))
``` ```
https://www.wikidata.org/wiki/Special:EntityData/Q60746544.json https://www.wikidata.org/wiki/Special:EntityData/Q60746544.json
https://www.wikidata.org/wiki/Special:EntityData/P527.json https://www.wikidata.org/wiki/Special:EntityData/P527.json
``` ```
# Example .env # Install
Copy docker-compose.yml
Create a data folder next to docker-compose.yml and .env, place the data file inside it, and set the file format in .env
```
├── data
│ ├── Entity.json
│ ├── latest-all.json.bz2
│ └── surrealdb
├── docker-compose.yml
└── .env
```
`docker compose up --pull always`
## Example .env
``` ```
DB_USER=root DB_USER=root
DB_PASSWORD=root DB_PASSWORD=root
WIKIDATA_LANG=en WIKIDATA_LANG=en
FILE_FORMAT=bz2 FILE_FORMAT=bz2
FILE_NAME=data/latest-all.json.bz2 FILE_NAME=data/latest-all.json.bz2
# If not running Wikidata to SurrealDB with Docker, use 0.0.0.0:8000
WIKIDATA_DB_PORT=surrealdb:8000
``` ```
## View Progress
`docker attach wikidata-to-surrealdb`
# [Dev Install](./CONTRIBUTING.md#dev-install)
# How to Query # How to Query
## See [Useful queries.md](./Useful%20queries.md) ## See [Useful queries.md](./Useful%20queries.md)

View file

@ -26,13 +26,8 @@ services:
context: . context: .
env_file: env_file:
- .env - .env
environment:
- DB_USER=$DB_USER
- DB_PASSWORD=$DB_PASSWORD
- WIKIDATA_LANG=$WIKIDATA_LANG
- FILE_FORMAT=$FILE_FORMAT
- FILE_NAME=$FILE_NAME
restart: no restart: no
tty: true
depends_on: depends_on:
- surrealdb - surrealdb
volumes: volumes:

View file

@ -25,13 +25,8 @@ services:
image: ghcr.io/nexveridian/ark-invest-api-rust-data:latest image: ghcr.io/nexveridian/ark-invest-api-rust-data:latest
env_file: env_file:
- .env - .env
environment:
- DB_USER=$DB_USER
- DB_PASSWORD=$DB_PASSWORD
- WIKIDATA_LANG=$WIKIDATA_LANG
- FILE_FORMAT=$FILE_FORMAT
- FILE_NAME=$FILE_NAME
restart: no restart: no
tty: true
depends_on: depends_on:
- surrealdb - surrealdb
volumes: volumes:

View file

@ -1,11 +1,15 @@
use anyhow::{Error, Ok, Result}; use anyhow::{Error, Ok, Result};
use bzip2::read::MultiBzDecoder; use bzip2::read::MultiBzDecoder;
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use serde_json::{from_str, Value}; use serde_json::{from_str, Value};
use std::{ use std::{
env, env,
fmt::Write,
fs::File, fs::File,
io::{BufRead, BufReader}, io::{BufRead, BufReader},
thread,
time::Duration,
}; };
use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal}; use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal};
use wikidata::Entity; use wikidata::Entity;
@ -17,8 +21,9 @@ lazy_static! {
#[derive(Debug)] #[derive(Debug)]
static ref DB_USER: String = env::var("DB_USER").expect("DB_USER not set"); static ref DB_USER: String = env::var("DB_USER").expect("DB_USER not set");
static ref DB_PASSWORD: String = env::var("DB_PASSWORD").expect("DB_PASSWORD not set"); static ref DB_PASSWORD: String = env::var("DB_PASSWORD").expect("DB_PASSWORD not set");
static ref FILE_FORMAT: String = env::var("FILE_FORMAT").expect("FILE_FORMAT not set"); static ref WIKIDATA_FILE_FORMAT: String = env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set");
static ref FILE_NAME: String = env::var("FILE_NAME").expect("FILE_NAME not set"); static ref WIKIDATA_FILE_NAME: String = env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set");
static ref WIKIDATA_DB_PORT: String = env::var("WIKIDATA_DB_PORT").expect("WIKIDATA_DB_PORT not set");
} }
#[allow(non_camel_case_types)] #[allow(non_camel_case_types)]
@ -45,7 +50,26 @@ impl File_Format {
#[tokio::main] #[tokio::main]
async fn main() -> Result<(), Error> { async fn main() -> Result<(), Error> {
let db = Surreal::new::<Ws>("0.0.0.0:8000").await?; thread::sleep(Duration::from_secs(10));
let mut compleated = 0;
let total_size = 113_000_000;
let pb = ProgressBar::new(total_size);
pb.set_style(
ProgressStyle::with_template(
"[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} {percent} ETA:{eta}",
)?
.with_key("eta", |state: &ProgressState, w: &mut dyn Write| {
let sec = state.eta().as_secs();
let min = (sec / 60) % 60;
let hr = (sec / 60) / 60;
write!(w, "{}:{:02}:{:02}", hr, min, sec % 60).unwrap()
}),
);
let db = Surreal::new::<Ws>(WIKIDATA_DB_PORT.as_str()).await?;
db.signin(Root { db.signin(Root {
username: &DB_USER, username: &DB_USER,
password: &DB_PASSWORD, password: &DB_PASSWORD,
@ -53,7 +77,7 @@ async fn main() -> Result<(), Error> {
.await?; .await?;
db.use_ns("wikidata").use_db("wikidata").await?; db.use_ns("wikidata").use_db("wikidata").await?;
let reader = File_Format::new(&FILE_FORMAT).reader(&FILE_NAME)?; let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;
for line in reader.lines() { for line in reader.lines() {
let line = line?.trim().trim_end_matches(',').to_string(); let line = line?.trim().trim_end_matches(',').to_string();
@ -75,7 +99,13 @@ async fn main() -> Result<(), Error> {
claims.id = None; claims.id = None;
let _: Option<Claims> = db.delete(&id).await?; let _: Option<Claims> = db.delete(&id).await?;
let _: Option<Claims> = db.create(&id).content(claims).await?; let _: Option<Claims> = db.create(&id).content(claims).await?;
compleated += 1;
if compleated % 1000 == 0 {
pb.set_position(compleated);
}
} }
pb.finish_with_message("Done parsing Wikidata");
Ok(()) Ok(())
} }

View file

@ -125,8 +125,8 @@ fn get_name(entity: &Entity) -> String {
entity entity
.labels .labels
.get(&Lang(WIKIDATA_LANG.to_string())) .get(&Lang(WIKIDATA_LANG.to_string()))
.expect("No label found") .map(|label| label.to_string())
.to_string() .unwrap_or_default()
} }
fn get_description(entity: &Entity) -> Option<String> { fn get_description(entity: &Entity) -> Option<String> {