diff --git a/.github/workflows/crane.yml b/.github/workflows/crane.yml index 21f6c9e..0a4861a 100644 --- a/.github/workflows/crane.yml +++ b/.github/workflows/crane.yml @@ -13,6 +13,7 @@ on: env: CARGO_TERM_COLOR: always + NIX_CONFIG: 'download-buffer-size = 5000MB' jobs: check: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 21e1cb6..2765ea4 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -20,6 +20,7 @@ env: REGISTRY: ghcr.io # github.repository as <account>/<repo> IMAGE_NAME: ${{ github.repository }} + NIX_CONFIG: 'download-buffer-size = 5000MB' jobs: build: diff --git a/.gitignore b/.gitignore index e70033f..d756134 100644 --- a/.gitignore +++ b/.gitignore @@ -105,5 +105,3 @@ venv.bak/ /data /target - -filter.surql diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 42d9ef0..8944096 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,7 +23,7 @@ Run tests with `nix flake check` or `cargo t` Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount` ### View Progress -`docker attach wikidata-to-surrealdb` +`make view` # License All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. [Why dual license](https://github.com/bevyengine/bevy/issues/2373). 
diff --git a/Cargo.lock b/Cargo.lock index 6f462ad..4013e00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3438,6 +3438,7 @@ dependencies = [ "thiserror", "tokio", "tokio-tungstenite", + "tokio-util", "tracing", "trice", "url", @@ -3818,6 +3819,7 @@ checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes 1.7.1", "futures-core", + "futures-io", "futures-sink", "pin-project-lite", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 790faac..5aa4eb7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ license = "MIT OR Apache-2.0" anyhow = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -surrealdb-alpha = { version = "2.0.9", features = ["protocol-ws", "kv-mem"] } +surrealdb-alpha = { version = "2.0.9", features = ["protocol-http", "kv-mem"] } tokio = { version = "1.39", features = ["fs", "time"] } futures = "0.3" wikidata = "1.1" diff --git a/README.md b/README.md index 426a0f5..d9f1b8f 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,9 @@ Create data folder next to docker-compose.yml and .env, place data inside, and s ├── data │ ├── Entity.json │ ├── latest-all.json.bz2 -│ └── surrealdb +│ ├── filter.surql +│ ├── surrealdb +│ └── temp ├── Makefile ├── docker-compose.yml └── .env @@ -54,7 +56,7 @@ WIKIDATA_DB_PORT=surrealdb:8000 # true=overwrite existing data, false=skip if already exists OVERWRITE_DB=false CREATE_VERSION=Bulk -#FILTER_PATH=../filter.surql +#FILTER_PATH=data/filter.surql ``` Env string CREATE_VERSION must be in the enum CREATE_VERSION diff --git a/src/main.rs b/src/main.rs index 30f8f1f..7616250 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ use anyhow::{Error, Ok, Result}; use lazy_static::lazy_static; use std::env; -use surrealdb::{engine::remote::ws::Client, Surreal}; +use surrealdb::{engine::remote::http::Client, Surreal}; use tokio::time::{sleep, Duration}; mod utils; @@ -30,6 +30,10 @@ async fn main() -> Result<(), Error> { let pb = 
init_progress_bar::create_pb().await; let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?; + tokio::fs::create_dir_all("data/temp").await?; + tokio::fs::remove_dir_all("data/temp").await?; + tokio::fs::create_dir_all("data/temp").await?; + CREATE_VERSION .run( None::<Surreal<Client>>, diff --git a/src/utils.rs b/src/utils.rs index 50ca0fd..d238202 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,4 +1,5 @@ use anyhow::{Error, Result}; +use core::panic; use futures::future::join_all; use indicatif::ProgressBar; use lazy_static::lazy_static; @@ -21,7 +22,7 @@ lazy_static! { .parse() .expect("Failed to parse OVERWRITE_DB"); static ref FILTER_PATH: String = - env::var("FILTER_PATH").unwrap_or("../filter.surql".to_string()); + env::var("FILTER_PATH").unwrap_or("data/filter.surql".to_string()); } pub async fn create_entity(db: &Surreal, line: &str) -> Result<(), Error> { @@ -108,7 +109,7 @@ impl CreateVersion { } } None => { - let db = match init_db::create_db_ws().await { + let db = match init_db::create_db_remote().await { Ok(db) => db, Err(_) => continue, }; @@ -141,6 +142,12 @@ impl CreateVersion { .create_bulk_filter(db, chunk, pb, batch_size) .await .is_ok(), + // CreateVersion::BulkFilter => { + // if let Err(err) = self.create_bulk_filter(db, chunk, pb, batch_size).await { + // panic!("Failed to create entities: {}", err); + // } + // true + // } } } @@ -233,7 +240,6 @@ impl CreateVersion { .collect(); let file_path = format!("data/temp/{}.surql", file_name); - tokio::fs::create_dir_all("data/temp").await?; db_mem.export(&file_path).await?; db.import(&file_path).await?; diff --git a/src/utils/init_db.rs b/src/utils/init_db.rs index 9ffa1bb..4733de1 100644 --- a/src/utils/init_db.rs +++ b/src/utils/init_db.rs @@ -5,7 +5,7 @@ use std::env; use surrealdb::{ engine::{ local::{Db, Mem}, - remote::ws::{Client, Ws}, + remote::http::{Client, Http}, }, opt::auth::Root, Surreal, @@ -18,8 +18,8 @@ lazy_static! 
{ env::var("WIKIDATA_DB_PORT").expect("WIKIDATA_DB_PORT not set"); } -pub async fn create_db_ws() -> Result<Surreal<Client>, Error> { - let db = Surreal::new::<Ws>(WIKIDATA_DB_PORT.as_str()).await?; +pub async fn create_db_remote() -> Result<Surreal<Client>, Error> { + let db = Surreal::new::<Http>(WIKIDATA_DB_PORT.as_str()).await?; db.signin(Root { username: &DB_USER, diff --git a/src/utils/init_progress_bar.rs b/src/utils/init_progress_bar.rs index 2ae849f..cb963dc 100644 --- a/src/utils/init_progress_bar.rs +++ b/src/utils/init_progress_bar.rs @@ -1,7 +1,7 @@ use indicatif::{ProgressBar, ProgressState, ProgressStyle}; pub async fn create_pb() -> ProgressBar { - let total_size = 110_000_000; + let total_size = 112_500_000; let pb = ProgressBar::new(total_size); pb.set_style( ProgressStyle::with_template( diff --git a/tests/data/test_filter.surql b/tests/data/test_filter.surql index 67559ea..15a95c0 100644 --- a/tests/data/test_filter.surql +++ b/tests/data/test_filter.surql @@ -5,4 +5,4 @@ let $entity = return (select id from $delete).id; let $claims = return (select claims from $delete).claims; delete $claims; -delete $entity; \ No newline at end of file +delete $entity;