mirror of
https://github.com/NexVeridian/wikidata-to-surrealdb.git
synced 2025-09-02 09:59:13 +00:00
progress bar, fix docker
This commit is contained in:
parent
8b3311ff21
commit
ee15b46ae2
8 changed files with 71 additions and 25 deletions
|
@ -4,8 +4,10 @@
|
||||||
/data
|
/data
|
||||||
/target
|
/target
|
||||||
.dockerignore
|
.dockerignore
|
||||||
|
.env
|
||||||
|
.gitignore
|
||||||
Cargo.lock
|
Cargo.lock
|
||||||
Dockerfile
|
|
||||||
docker-compose.yml
|
|
||||||
docker-compose.dev.yml
|
docker-compose.dev.yml
|
||||||
|
docker-compose.yml
|
||||||
|
dockerfile
|
||||||
*.md
|
*.md
|
||||||
|
|
|
@ -13,3 +13,4 @@ tokio = "1.35"
|
||||||
wikidata = "0.3.1"
|
wikidata = "0.3.1"
|
||||||
bzip2 = { version = "0.4", features = ["tokio"] }
|
bzip2 = { version = "0.4", features = ["tokio"] }
|
||||||
lazy_static = "1.4"
|
lazy_static = "1.4"
|
||||||
|
indicatif = "0.17"
|
||||||
|
|
|
@ -20,7 +20,7 @@ RUN --mount=type=cache,target=/usr/local/cargo,from=rust,source=/usr/local/cargo
|
||||||
|
|
||||||
FROM alpine:latest AS main
|
FROM alpine:latest AS main
|
||||||
|
|
||||||
WORKDIR /wikidata-to-surrealdb
|
# WORKDIR /wikidata-to-surrealdb
|
||||||
|
|
||||||
COPY --from=builder wikidata-to-surrealdb/wikidata-to-surrealdb .
|
COPY --from=builder wikidata-to-surrealdb/wikidata-to-surrealdb .
|
||||||
|
|
||||||
|
|
27
README.md
27
README.md
|
@ -1,9 +1,10 @@
|
||||||
|
# Wikidata to SurrealDB
|
||||||
A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database, from either a bz2 or a json file.
|
A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database, from either a bz2 or a json file.
|
||||||
|
|
||||||
# Getting The Data
|
# Getting The Data
|
||||||
https://www.wikidata.org/wiki/Wikidata:Data_access
|
https://www.wikidata.org/wiki/Wikidata:Data_access
|
||||||
|
|
||||||
## From bz2 file (Recommended) ~80GB
|
## From bz2 file ~80GB
|
||||||
### Dump: [Docs](https://www.wikidata.org/wiki/Wikidata:Database_download)
|
### Dump: [Docs](https://www.wikidata.org/wiki/Wikidata:Database_download)
|
||||||
### [Download - latest-all.json.bz2](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
|
### [Download - latest-all.json.bz2](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
|
||||||
|
|
||||||
|
@ -14,15 +15,37 @@ https://www.wikidata.org/wiki/Special:EntityData/Q60746544.json
|
||||||
https://www.wikidata.org/wiki/Special:EntityData/P527.json
|
https://www.wikidata.org/wiki/Special:EntityData/P527.json
|
||||||
```
|
```
|
||||||
|
|
||||||
# Example .env
|
# Install
|
||||||
|
Copy docker-compose.yml
|
||||||
|
|
||||||
|
Create a `data` folder next to docker-compose.yml and .env, place the data files inside it, and set the file format and file name in .env
|
||||||
|
```
|
||||||
|
├── data
|
||||||
|
│ ├── Entity.json
|
||||||
|
│ ├── latest-all.json.bz2
|
||||||
|
│ └── surrealdb
|
||||||
|
├── docker-compose.yml
|
||||||
|
└── .env
|
||||||
|
```
|
||||||
|
|
||||||
|
`docker compose up --pull always`
|
||||||
|
|
||||||
|
## Example .env
|
||||||
```
|
```
|
||||||
DB_USER=root
|
DB_USER=root
|
||||||
DB_PASSWORD=root
|
DB_PASSWORD=root
|
||||||
WIKIDATA_LANG=en
|
WIKIDATA_LANG=en
|
||||||
FILE_FORMAT=bz2
|
WIKIDATA_FILE_FORMAT=bz2
|
||||||
FILE_NAME=data/latest-all.json.bz2
|
WIKIDATA_FILE_NAME=data/latest-all.json.bz2
|
||||||
|
# If running Wikidata to SurrealDB outside of Docker, use 0.0.0.0:8000
|
||||||
|
WIKIDATA_DB_PORT=surrealdb:8000
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## View Progress
|
||||||
|
`docker attach wikidata-to-surrealdb`
|
||||||
|
|
||||||
|
# [Dev Install](./CONTRIBUTING.md#dev-install)
|
||||||
|
|
||||||
# How to Query
|
# How to Query
|
||||||
## See [Useful queries.md](./Useful%20queries.md)
|
## See [Useful queries.md](./Useful%20queries.md)
|
||||||
|
|
||||||
|
|
|
@ -26,13 +26,8 @@ services:
|
||||||
context: .
|
context: .
|
||||||
env_file:
|
env_file:
|
||||||
- .env
|
- .env
|
||||||
environment:
|
|
||||||
- DB_USER=$DB_USER
|
|
||||||
- DB_PASSWORD=$DB_PASSWORD
|
|
||||||
- WIKIDATA_LANG=$WIKIDATA_LANG
|
|
||||||
- FILE_FORMAT=$FILE_FORMAT
|
|
||||||
- FILE_NAME=$FILE_NAME
|
|
||||||
restart: no
|
restart: no
|
||||||
|
tty: true
|
||||||
depends_on:
|
depends_on:
|
||||||
- surrealdb
|
- surrealdb
|
||||||
volumes:
|
volumes:
|
||||||
|
|
|
@ -25,13 +25,8 @@ services:
|
||||||
image: ghcr.io/nexveridian/ark-invest-api-rust-data:latest
|
image: ghcr.io/nexveridian/ark-invest-api-rust-data:latest
|
||||||
env_file:
|
env_file:
|
||||||
- .env
|
- .env
|
||||||
environment:
|
|
||||||
- DB_USER=$DB_USER
|
|
||||||
- DB_PASSWORD=$DB_PASSWORD
|
|
||||||
- WIKIDATA_LANG=$WIKIDATA_LANG
|
|
||||||
- FILE_FORMAT=$FILE_FORMAT
|
|
||||||
- FILE_NAME=$FILE_NAME
|
|
||||||
restart: no
|
restart: no
|
||||||
|
tty: true
|
||||||
depends_on:
|
depends_on:
|
||||||
- surrealdb
|
- surrealdb
|
||||||
volumes:
|
volumes:
|
||||||
|
|
38
src/main.rs
38
src/main.rs
|
@ -1,11 +1,15 @@
|
||||||
use anyhow::{Error, Ok, Result};
|
use anyhow::{Error, Ok, Result};
|
||||||
use bzip2::read::MultiBzDecoder;
|
use bzip2::read::MultiBzDecoder;
|
||||||
|
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use serde_json::{from_str, Value};
|
use serde_json::{from_str, Value};
|
||||||
use std::{
|
use std::{
|
||||||
env,
|
env,
|
||||||
|
fmt::Write,
|
||||||
fs::File,
|
fs::File,
|
||||||
io::{BufRead, BufReader},
|
io::{BufRead, BufReader},
|
||||||
|
thread,
|
||||||
|
time::Duration,
|
||||||
};
|
};
|
||||||
use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal};
|
use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal};
|
||||||
use wikidata::Entity;
|
use wikidata::Entity;
|
||||||
|
@ -17,8 +21,9 @@ lazy_static! {
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
static ref DB_USER: String = env::var("DB_USER").expect("DB_USER not set");
|
static ref DB_USER: String = env::var("DB_USER").expect("DB_USER not set");
|
||||||
static ref DB_PASSWORD: String = env::var("DB_PASSWORD").expect("DB_PASSWORD not set");
|
static ref DB_PASSWORD: String = env::var("DB_PASSWORD").expect("DB_PASSWORD not set");
|
||||||
static ref FILE_FORMAT: String = env::var("FILE_FORMAT").expect("FILE_FORMAT not set");
|
static ref WIKIDATA_FILE_FORMAT: String = env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set");
|
||||||
static ref FILE_NAME: String = env::var("FILE_NAME").expect("FILE_NAME not set");
|
static ref WIKIDATA_FILE_NAME: String = env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set");
|
||||||
|
static ref WIKIDATA_DB_PORT: String = env::var("WIKIDATA_DB_PORT").expect("WIKIDATA_DB_PORT not set");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(non_camel_case_types)]
|
#[allow(non_camel_case_types)]
|
||||||
|
@ -45,7 +50,26 @@ impl File_Format {
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<(), Error> {
|
async fn main() -> Result<(), Error> {
|
||||||
let db = Surreal::new::<Ws>("0.0.0.0:8000").await?;
|
thread::sleep(Duration::from_secs(10));
|
||||||
|
|
||||||
|
let mut compleated = 0;
|
||||||
|
let total_size = 113_000_000;
|
||||||
|
|
||||||
|
let pb = ProgressBar::new(total_size);
|
||||||
|
pb.set_style(
|
||||||
|
ProgressStyle::with_template(
|
||||||
|
"[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} {percent} ETA:{eta}",
|
||||||
|
)?
|
||||||
|
.with_key("eta", |state: &ProgressState, w: &mut dyn Write| {
|
||||||
|
let sec = state.eta().as_secs();
|
||||||
|
let min = (sec / 60) % 60;
|
||||||
|
let hr = (sec / 60) / 60;
|
||||||
|
write!(w, "{}:{:02}:{:02}", hr, min, sec % 60).unwrap()
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
let db = Surreal::new::<Ws>(WIKIDATA_DB_PORT.as_str()).await?;
|
||||||
|
|
||||||
db.signin(Root {
|
db.signin(Root {
|
||||||
username: &DB_USER,
|
username: &DB_USER,
|
||||||
password: &DB_PASSWORD,
|
password: &DB_PASSWORD,
|
||||||
|
@ -53,7 +77,7 @@ async fn main() -> Result<(), Error> {
|
||||||
.await?;
|
.await?;
|
||||||
db.use_ns("wikidata").use_db("wikidata").await?;
|
db.use_ns("wikidata").use_db("wikidata").await?;
|
||||||
|
|
||||||
let reader = File_Format::new(&FILE_FORMAT).reader(&FILE_NAME)?;
|
let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;
|
||||||
|
|
||||||
for line in reader.lines() {
|
for line in reader.lines() {
|
||||||
let line = line?.trim().trim_end_matches(',').to_string();
|
let line = line?.trim().trim_end_matches(',').to_string();
|
||||||
|
@ -75,7 +99,13 @@ async fn main() -> Result<(), Error> {
|
||||||
claims.id = None;
|
claims.id = None;
|
||||||
let _: Option<Claims> = db.delete(&id).await?;
|
let _: Option<Claims> = db.delete(&id).await?;
|
||||||
let _: Option<Claims> = db.create(&id).content(claims).await?;
|
let _: Option<Claims> = db.create(&id).content(claims).await?;
|
||||||
|
|
||||||
|
compleated += 1;
|
||||||
|
if compleated % 1000 == 0 {
|
||||||
|
pb.set_position(compleated);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pb.finish_with_message("Done parsing Wikidata");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
@ -125,8 +125,8 @@ fn get_name(entity: &Entity) -> String {
|
||||||
entity
|
entity
|
||||||
.labels
|
.labels
|
||||||
.get(&Lang(WIKIDATA_LANG.to_string()))
|
.get(&Lang(WIKIDATA_LANG.to_string()))
|
||||||
.expect("No label found")
|
.map(|label| label.to_string())
|
||||||
.to_string()
|
.unwrap_or_default()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_description(entity: &Entity) -> Option<String> {
|
fn get_description(entity: &Entity) -> Option<String> {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue