This commit is contained in:
Elijah McMorris 2023-12-15 07:38:39 +00:00
parent 7bf0033970
commit 44b66d43c1
Signed by: NexVeridian
SSH key fingerprint: SHA256:bsA1SKZxuEcEVHAy3gY1HUeM5ykRJl0U0kQHQn0hMg8
7 changed files with 164 additions and 5 deletions

View file

@ -2,6 +2,25 @@
- Make sure the test pass - Make sure the test pass
- Run `cargo clippy --fix --allow-dirty` - Run `cargo clippy --fix --allow-dirty`
# Dev Install
## Dev Containers
Install Docker, VS Code, and the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
`git clone`
`Ctrl+Shift+P` **Dev Containers: Open Folder in Container**
Run code with `F5` or `cargo run`
Run tests with `cargo t`
## Docker Compose
`git clone`
`docker compose -f docker-compose.dev.yml build && docker compose -f docker-compose.dev.yml up`
Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount`
# License # License
All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer.

27
DockerFile Normal file
View file

@ -0,0 +1,27 @@
# ---- Build stage: compile a musl-targeted release binary ----
FROM rust:bookworm AS builder

# Toolchain pieces for the musl target and linking.
# apt-get is used for both steps: the `apt` front end is meant for interactive
# use and warns that its CLI is not stable when called from scripts.
RUN apt-get update && \
    apt-get install -y musl-tools musl-dev libssl-dev clang mold

# Install the prebuilt cargo-nextest binary into /usr/local/bin rather than
# $CARGO_HOME/bin, because /usr/local/cargo is covered by a cache mount in the
# build step below.
# RUN curl -LsSf https://get.nexte.st/latest/linux | tar zxf - -C ${CARGO_HOME:-~/.cargo}/bin
RUN curl -LsSf https://get.nexte.st/latest/linux | tar zxf - -C /usr/local/bin
# RUN cargo install cargo-nextest --locked

WORKDIR /wikidata-to-surrealdb
COPY . .

RUN rustup target add x86_64-unknown-linux-musl && rustup update && cargo update

# BuildKit cache mounts keep the cargo home and the target directory between
# builds. Cache mounts are not part of the image layer, so the binary has to be
# copied out of ./target within the same RUN instruction.
# NOTE(review): the source path is ./target/target/..., which only exists if the
# project sets a custom cargo target-dir (e.g. "target/target"); with the
# default layout the binary would be at ./target/x86_64-unknown-linux-musl/release/.
# Confirm against .cargo/config.toml.
RUN --mount=type=cache,target=/usr/local/cargo,from=rust,source=/usr/local/cargo \
    --mount=type=cache,target=./target \
    cargo build --target x86_64-unknown-linux-musl --release && \
    cp ./target/target/x86_64-unknown-linux-musl/release/wikidata-to-surrealdb .

# ---- Runtime stage: ship only the compiled binary ----
FROM alpine:latest AS main
WORKDIR /wikidata-to-surrealdb
# Absolute source path: COPY --from resolves sources from the root of the
# builder stage's filesystem, so this is the same file as before, spelled
# unambiguously.
COPY --from=builder /wikidata-to-surrealdb/wikidata-to-surrealdb .
CMD ["./wikidata-to-surrealdb"]

View file

@ -1,8 +1,77 @@
# Similar Libraries A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database, from either a bz2 or a JSON file.
# Getting The Data
https://www.wikidata.org/wiki/Wikidata:Data_access
## From bz2 file (Recommended) ~80GB
### Dump: [Docs](https://www.wikidata.org/wiki/Wikidata:Database_download)
### [Download - latest-all.json.bz2](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
## From json file
### Linked Data Interface: [Docs](https://www.wikidata.org/wiki/Wikidata:Data_access#Linked_Data_Interface_(URI))
```
https://www.wikidata.org/wiki/Special:EntityData/Q60746544.json
https://www.wikidata.org/wiki/Special:EntityData/P527.json
```
# Example .env
```
DB_USER=root
DB_PASSWORD=root
WIKIDATA_LANG=en
FILE_FORMAT=bz2
FILE_NAME=data/latest-all.json.bz2
```
# How to Query
## See [Useful queries.md](./Useful%20queries.md)
# Table Layout
## Thing
```rust
/// Reference to a record: the `table` it lives in plus its `id`.
/// NOTE(review): presumably mirrors SurrealDB's own `Thing` record-id type — confirm.
pub struct Thing {
pub table: String,
pub id: Id,
}
```
## Table: Entity, Property, Lexeme
```rust
/// Row layout shared by the `Entity`, `Property` and `Lexeme` tables.
pub struct EntityMini {
pub id: Option<Thing>,
pub label: String,
// Reference to the record holding this row's claims; the queries in
// "Useful queries.md" traverse it as `claims.claims[...]`.
pub claims: Thing,
pub description: String,
}
```
## Table: Claims
```rust
/// Row layout of the `Claims` table: a list of `Claim` values under one record id.
pub struct Claims {
pub id: Option<Thing>,
pub claims: Vec<Claim>,
}
```
## Table: Claim
```rust
/// A single claim: an `id` paired with its `value`.
/// The queries in "Useful queries.md" filter on `id` with property references
/// such as `Property:1113`.
pub struct Claim {
pub id: Thing,
pub value: ClaimData,
}
```
## ClaimData
```rust
/// Value carried by a `Claim`: either a reference to another record (`Thing`)
/// or a `ClaimValueData` payload.
pub enum ClaimData {
Thing(Thing),
ClaimValueData(ClaimValueData),
}
```
# Similar Projects
- [wd2duckdb](https://github.com/weso/wd2duckdb) - [wd2duckdb](https://github.com/weso/wd2duckdb)
- [wd2sql](https://github.com/p-e-w/wd2sql) - [wd2sql](https://github.com/p-e-w/wd2sql)
# License # License
All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. All code in this repository is dual-licensed under either [License-MIT](./LICENSE-MIT) or [LICENSE-APACHE](./LICENSE-Apache) at your option. This means you can select the license you prefer. [Why dual license](https://github.com/bevyengine/bevy/issues/2373).
[Why dual license](https://github.com/bevyengine/bevy/issues/2373)

View file

@ -1,11 +1,15 @@
# Get number of episodes # Get number of episodes
```
let $number_of_episodes = (select claims.claims[where id = Property:1113][0].value.ClaimValueData.Quantity.amount as number_of_episodes from Entity where label = "Black Clover, season 1")[0].number_of_episodes; let $number_of_episodes = (select claims.claims[where id = Property:1113][0].value.ClaimValueData.Quantity.amount as number_of_episodes from Entity where label = "Black Clover, season 1")[0].number_of_episodes;
return $number_of_episodes[0].number_of_episodes; return $number_of_episodes[0].number_of_episodes;
update Entity SET number_of_episodes=$number_of_episodes where label = "Black Clover, season 1"; update Entity SET number_of_episodes=$number_of_episodes where label = "Black Clover, season 1";
```
# Get Parts # Get Parts
```
let $parts = (select claims.claims[where id = Property:527].value.Thing as parts from Entity where label = "Black Clover")[0].parts; let $parts = (select claims.claims[where id = Property:527].value.Thing as parts from Entity where label = "Black Clover")[0].parts;
return $parts; return $parts;
```

30
docker-compose.dev.yml Normal file
View file

@ -0,0 +1,30 @@
# Development stack: a SurrealDB instance plus the importer built from the
# local checkout.
version: "3"

services:
  surrealdb:
    container_name: surrealdb
    image: surrealdb/surrealdb:latest
    env_file:
      - .env
    # $DB_USER and $DB_PASSWORD are substituted by Compose when the file is
    # parsed (from the shell environment or the project's .env file).
    entrypoint:
      - /surreal
      - start
      - --user
      - $DB_USER
      - --pass
      - $DB_PASSWORD
      - file:/data/surrealdb
    ports:
      - 8000:8000
    volumes:
      - ./data:/data

  wikidata-to-surrealdb:
    container_name: wikidata-to-surrealdb
    build:
      context: .
      # The build file is committed as "DockerFile"; Compose only looks for
      # "Dockerfile" by default, which fails on case-sensitive filesystems.
      dockerfile: DockerFile
    restart: unless-stopped
    volumes:
      - ./data:/data

volumes:
  data:

View file

@ -1,8 +1,8 @@
version: "3" version: "3"
services: services:
surrealdb: surrealdb:
image: surrealdb/surrealdb:latest
container_name: surrealdb container_name: surrealdb
image: surrealdb/surrealdb:latest
env_file: env_file:
- .env - .env
entrypoint: entrypoint:
@ -18,5 +18,12 @@ services:
volumes: volumes:
- ./data:/data - ./data:/data
wikidata-to-surrealdb:
  container_name: wikidata-to-surrealdb
  # Use this project's own image; the previous value pointed at the unrelated
  # ark-invest-api-rust-data image.
  # NOTE(review): confirm the published image name on ghcr.io.
  image: ghcr.io/nexveridian/wikidata-to-surrealdb:latest
  restart: unless-stopped
  volumes:
    - ./data:/data
volumes: volumes:
data: data:

View file

@ -31,18 +31,21 @@ impl ClaimData {
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Claims { pub struct Claims {
// Table: Claims
pub id: Option<Thing>, pub id: Option<Thing>,
pub claims: Vec<Claim>, pub claims: Vec<Claim>,
} }
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Claim { pub struct Claim {
// Table: Claim
pub id: Thing, pub id: Thing,
pub value: ClaimData, pub value: ClaimData,
} }
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct EntityMini { pub struct EntityMini {
// Table: Entity, Property, Lexeme
pub id: Option<Thing>, pub id: Option<Thing>,
pub label: String, pub label: String,
pub claims: Thing, pub claims: Thing,