diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7764671..65206e3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,6 +2,25 @@ - Make sure the test pass - Run `cargo clippy --fix --allow-dirty` +# Dev Install +## Dev Containers +Install docker, vscode and the [Dev Containers Extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) + +`git clone` + +`Ctrl+Shift+P` **Dev Containers: Open Folder in Container** + +Run code with `F5` or `cargo run` + +Run tests with `cargo t` + +## Docker Compose +`git clone` + +`docker compose -f docker-compose.dev.yml build && docker compose -f docker-compose.dev.yml up` + +Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount` + # License All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. diff --git a/DockerFile b/DockerFile new file mode 100644 index 0000000..361fe1c --- /dev/null +++ b/DockerFile @@ -0,0 +1,27 @@ +FROM rust:bookworm AS builder + +RUN apt-get update && \ + apt install -y musl-tools musl-dev libssl-dev clang mold + +# RUN curl -LsSf https://get.nexte.st/latest/linux | tar zxf - -C ${CARGO_HOME:-~/.cargo}/bin +RUN curl -LsSf https://get.nexte.st/latest/linux | tar zxf - -C /usr/local/bin +# RUN cargo install cargo-nextest --locked + +WORKDIR /wikidata-to-surrealdb + +COPY . . + +RUN rustup target add x86_64-unknown-linux-musl && rustup update && cargo update + +RUN --mount=type=cache,target=/usr/local/cargo,from=rust,source=/usr/local/cargo \ + --mount=type=cache,target=./target \ + cargo build --target x86_64-unknown-linux-musl --release && \ + cp ./target/target/x86_64-unknown-linux-musl/release/wikidata-to-surrealdb . + +FROM alpine:latest AS main + +WORKDIR /wikidata-to-surrealdb + +COPY --from=builder wikidata-to-surrealdb/wikidata-to-surrealdb . 
+ +CMD ["./wikidata-to-surrealdb"] \ No newline at end of file diff --git a/README.md b/README.md index b353d41..f7b869e 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,77 @@ -# Similar Libraries +A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database. The input can be either a bz2-compressed dump or a plain JSON file. + +# Getting The Data +https://www.wikidata.org/wiki/Wikidata:Data_access + +## From bz2 file (Recommended) ~80GB +### Dump: [Docs](https://www.wikidata.org/wiki/Wikidata:Database_download) +### [Download - latest-all.json.bz2](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) + +## From json file +### Linked Data Interface: [Docs](https://www.wikidata.org/wiki/Wikidata:Data_access#Linked_Data_Interface_(URI)) +``` +https://www.wikidata.org/wiki/Special:EntityData/Q60746544.json +https://www.wikidata.org/wiki/Special:EntityData/P527.json +``` + +# Example .env +``` +DB_USER=root +DB_PASSWORD=root +WIKIDATA_LANG=en +FILE_FORMAT=bz2 +FILE_NAME=data/latest-all.json.bz2 +``` + +# How to Query +## See [Useful queries.md](./Useful%20queries.md) + +# Table Layout +## Thing +```rust +pub struct Thing { + pub table: String, + pub id: Id, +} +``` + +## Table: Entity, Property, Lexeme +```rust +pub struct EntityMini { + pub id: Option<Thing>, + pub label: String, + pub claims: Thing, + pub description: String, +} +``` + +## Table: Claims +```rust +pub struct Claims { + pub id: Option<Thing>, + pub claims: Vec<Claim>, +} +``` + +## Table: Claim +```rust +pub struct Claim { + pub id: Thing, + pub value: ClaimData, +} +``` + +## ClaimData +```rust +pub enum ClaimData { + Thing(Thing), + ClaimValueData(ClaimValueData), +} +``` + +# Similar Projects - [wd2duckdb](https://github.com/weso/wd2duckdb) - [wd2sql](https://github.com/p-e-w/wd2sql) # License -All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. 
- -[Why dual license](https://github.com/bevyengine/bevy/issues/2373) +All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. [Why dual license](https://github.com/bevyengine/bevy/issues/2373). diff --git a/Useful queries.md b/Useful queries.md index 85cfdca..371f9ec 100644 --- a/Useful queries.md +++ b/Useful queries.md @@ -1,11 +1,15 @@ # Get number of episodes +``` let $number_of_episodes = (select claims.claims[where id = Property:1113][0].value.ClaimValueData.Quantity.amount as number_of_episodes from Entity where label = "Black Clover, season 1")[0].number_of_episodes; return $number_of_episodes[0].number_of_episodes; update Entity SET number_of_episodes=$number_of_episodes where label = "Black Clover, season 1"; +``` # Get Parts +``` let $parts = (select claims.claims[where id = Property:527].value.Thing as parts from Entity where label = "Black Clover")[0].parts; return $parts; +``` diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000..6cdfb90 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,30 @@ +version: "3" +services: + surrealdb: + container_name: surrealdb + image: surrealdb/surrealdb:latest + env_file: + - .env + entrypoint: + - /surreal + - start + - --user + - $DB_USER + - --pass + - $DB_PASSWORD + - file:/data/surrealdb + ports: + - 8000:8000 + volumes: + - ./data:/data + + wikidata-to-surrealdb: + container_name: wikidata-to-surrealdb + build: + context: . 
+ restart: unless-stopped + volumes: + - ./data:/data + +volumes: + data: diff --git a/docker-compose.yml b/docker-compose.yml index add4ee3..15143df 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,8 +1,8 @@ version: "3" services: surrealdb: - image: surrealdb/surrealdb:latest container_name: surrealdb + image: surrealdb/surrealdb:latest env_file: - .env entrypoint: @@ -18,5 +18,12 @@ services: volumes: - ./data:/data + wikidata-to-surrealdb: + container_name: wikidata-to-surrealdb + image: ghcr.io/nexveridian/ark-invest-api-rust-data:latest + restart: unless-stopped + volumes: + - ./data:/data + volumes: data: diff --git a/src/utils.rs b/src/utils.rs index b947dee..9cb8e20 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -31,18 +31,21 @@ impl ClaimData { #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Claims { + // Table: Claims pub id: Option<Thing>, pub claims: Vec<Claim>, } #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Claim { + // Table: Claim pub id: Thing, pub value: ClaimData, } #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct EntityMini { + // Table: Entity, Property, Lexeme pub id: Option<Thing>, pub label: String, pub claims: Thing,