mirror of
https://github.com/NexVeridian/wikidata-to-surrealdb.git
synced 2025-09-02 01:49:13 +00:00
tests
This commit is contained in:
parent
2edaeef042
commit
e37d413372
14 changed files with 525 additions and 250 deletions
109
.github/workflows/docker.yml
vendored
Normal file
109
.github/workflows/docker.yml
vendored
Normal file
|
@ -0,0 +1,109 @@
|
||||||
|
name: Docker
|
||||||
|
|
||||||
|
# This workflow uses actions that are not certified by GitHub.
|
||||||
|
# They are provided by a third-party and are governed by
|
||||||
|
# separate terms of service, privacy policy, and support
|
||||||
|
# documentation.
|
||||||
|
on:
|
||||||
|
workflow_run:
|
||||||
|
workflows: [nextest]
|
||||||
|
branches: [main]
|
||||||
|
types:
|
||||||
|
- completed
|
||||||
|
# schedule:
|
||||||
|
# - cron: 0 0 * * 1
|
||||||
|
# push:
|
||||||
|
# branches: [ "main" ]
|
||||||
|
# # Publish semver tags as releases.
|
||||||
|
# tags: [ 'v*.*.*' ]
|
||||||
|
# pull_request:
|
||||||
|
# branches: [ "main" ]
|
||||||
|
|
||||||
|
env:
|
||||||
|
# Use docker.io for Docker Hub if empty
|
||||||
|
REGISTRY: ghcr.io
|
||||||
|
# github.repository as <account>/<repo>
|
||||||
|
IMAGE_NAME: ${{ github.repository }}
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#running-a-workflow-based-on-the-conclusion-of-another-workflow
|
||||||
|
if: ${{ github.event.workflow_run.conclusion == 'success' }}
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
|
# This is used to complete the identity challenge
|
||||||
|
# with sigstore/fulcio when running outside of PRs.
|
||||||
|
id-token: write
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
# Install the cosign tool except on PR
|
||||||
|
# https://github.com/sigstore/cosign-installer
|
||||||
|
- name: Install cosign
|
||||||
|
if: github.event_name != 'pull_request'
|
||||||
|
uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 #v3.1.1
|
||||||
|
with:
|
||||||
|
cosign-release: "v2.1.1"
|
||||||
|
|
||||||
|
# Set up BuildKit Docker container builder to be able to build
|
||||||
|
# multi-platform images and export cache
|
||||||
|
# https://github.com/docker/setup-buildx-action
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@f95db51fddba0c2d1ec667646a06c2ce06100226 # v3.0.0
|
||||||
|
|
||||||
|
# Login against a Docker registry except on PR
|
||||||
|
# https://github.com/docker/login-action
|
||||||
|
- name: Log into registry ${{ env.REGISTRY }}
|
||||||
|
if: github.event_name != 'pull_request'
|
||||||
|
uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0
|
||||||
|
with:
|
||||||
|
registry: ${{ env.REGISTRY }}
|
||||||
|
username: ${{ github.actor }}
|
||||||
|
password: ${{ secrets.GHCR_TOKEN }}
|
||||||
|
|
||||||
|
# Extract metadata (tags, labels) for Docker
|
||||||
|
# https://github.com/docker/metadata-action
|
||||||
|
- name: Extract Docker metadata
|
||||||
|
id: meta
|
||||||
|
uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 # v5.0.0
|
||||||
|
with:
|
||||||
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||||
|
|
||||||
|
# https://github.com/orgs/community/discussions/25768#discussioncomment-3249183
|
||||||
|
- name: downcase REPO
|
||||||
|
run: |
|
||||||
|
echo "REPO=${GITHUB_REPOSITORY,,}" >>${GITHUB_ENV}
|
||||||
|
|
||||||
|
# Build and push Docker image with Buildx (don't push on PR)
|
||||||
|
# https://github.com/docker/build-push-action
|
||||||
|
- name: Build and push Docker image
|
||||||
|
id: build-and-push
|
||||||
|
uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 # v5.0.0
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
|
# tags: ${{ steps.meta.outputs.tags }}
|
||||||
|
# tags: ${{ env.REGISTRY }}/${{ github.repository }}:latest
|
||||||
|
tags: ${{ env.REGISTRY }}/${{ env.REPO }}:latest
|
||||||
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
|
||||||
|
# # Sign the resulting Docker image digest except on PRs.
|
||||||
|
# # This will only write to the public Rekor transparency log when the Docker
|
||||||
|
# # repository is public to avoid leaking data. If you would like to publish
|
||||||
|
# # transparency data even for private images, pass --force to cosign below.
|
||||||
|
# # https://github.com/sigstore/cosign
|
||||||
|
# - name: Sign the published Docker image
|
||||||
|
# if: ${{ github.event_name != 'pull_request' }}
|
||||||
|
# env:
|
||||||
|
# # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
|
||||||
|
# TAGS: ${{ steps.meta.outputs.tags }}
|
||||||
|
# DIGEST: ${{ steps.build-and-push.outputs.digest }}
|
||||||
|
# # This step uses the identity token to provision an ephemeral certificate
|
||||||
|
# # against the sigstore community Fulcio instance.
|
||||||
|
# run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST}
|
45
.github/workflows/nextest.yml
vendored
Normal file
45
.github/workflows/nextest.yml
vendored
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
# https://github.com/nextest-rs/reuse-build-partition-example
|
||||||
|
# https://keliris.dev/articles/setup-rust-github-actions
|
||||||
|
|
||||||
|
name: nextest
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
branches: [main]
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
schedule:
|
||||||
|
- cron: 0 0 * * 1
|
||||||
|
|
||||||
|
env:
|
||||||
|
CARGO_TERM_COLOR: always
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
run-tests:
|
||||||
|
name: run tests
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
# - uses: rui314/setup-mold@v1
|
||||||
|
- name: install mold
|
||||||
|
run: sudo apt-get install -y musl-tools musl-dev libssl-dev clang mold
|
||||||
|
# https://github.com/moonrepo/setup-rust
|
||||||
|
- uses: moonrepo/setup-rust@v1
|
||||||
|
with:
|
||||||
|
bins: cargo-nextest
|
||||||
|
- name: Run tests
|
||||||
|
run: cargo nextest run -E "all() - test(get_api) - kind(bin)"
|
||||||
|
|
||||||
|
clippy:
|
||||||
|
name: clippy
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
# - uses: rui314/setup-mold@v1
|
||||||
|
- name: install mold
|
||||||
|
run: sudo apt-get install -y musl-tools musl-dev libssl-dev clang mold
|
||||||
|
- uses: moonrepo/setup-rust@v1
|
||||||
|
with:
|
||||||
|
components: clippy
|
||||||
|
- name: clippy
|
||||||
|
run: cargo clippy
|
|
@ -15,3 +15,6 @@ wikidata = "0.3.1"
|
||||||
bzip2 = { version = "0.4", features = ["tokio"] }
|
bzip2 = { version = "0.4", features = ["tokio"] }
|
||||||
lazy_static = "1.4"
|
lazy_static = "1.4"
|
||||||
indicatif = "0.17"
|
indicatif = "0.17"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
surrealdb = { version = "1.0", features = ["kv-mem"] }
|
||||||
|
|
23
README.md
23
README.md
|
@ -1,5 +1,5 @@
|
||||||
# Wikidata to SurrealDB
|
# Wikidata to SurrealDB
|
||||||
A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database. Either From a bz2 or json file format.
|
A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database. It accepts either a bz2 or a json file.
|
||||||
|
|
||||||
# Getting The Data
|
# Getting The Data
|
||||||
https://www.wikidata.org/wiki/Wikidata:Data_access
|
https://www.wikidata.org/wiki/Wikidata:Data_access
|
||||||
|
@ -50,20 +50,21 @@ THREADED_REQUESTS=true
|
||||||
# How to Query
|
# How to Query
|
||||||
## See [Useful queries.md](./Useful%20queries.md)
|
## See [Useful queries.md](./Useful%20queries.md)
|
||||||
|
|
||||||
# Table Layout
|
# Table Schema
|
||||||
## Thing
|
## SurrealDB Thing
|
||||||
```rust
|
```rust
|
||||||
pub struct Thing {
|
pub struct Thing {
|
||||||
pub table: String,
|
pub table: String,
|
||||||
pub id: Id,
|
pub id: Id, // i64
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## Table: Entity, Property, Lexeme
|
## Tables: Entity, Property, Lexeme
|
||||||
```rust
|
```rust
|
||||||
pub struct EntityMini {
|
pub struct EntityMini {
|
||||||
pub id: Option<Thing>,
|
pub id: Option<Thing>,
|
||||||
pub label: String,
|
pub label: String,
|
||||||
|
// Claims Table
|
||||||
pub claims: Thing,
|
pub claims: Thing,
|
||||||
pub description: String,
|
pub description: String,
|
||||||
}
|
}
|
||||||
|
@ -71,27 +72,21 @@ pub struct EntityMini {
|
||||||
|
|
||||||
## Table: Claims
|
## Table: Claims
|
||||||
```rust
|
```rust
|
||||||
pub struct Claims {
|
|
||||||
pub id: Option<Thing>,
|
|
||||||
pub claims: Vec<Claim>,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Table: Claim
|
|
||||||
```rust
|
|
||||||
pub struct Claim {
|
pub struct Claim {
|
||||||
pub id: Thing,
|
pub id: Thing,
|
||||||
pub value: ClaimData,
|
pub value: ClaimData,
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## ClaimData
|
### ClaimData
|
||||||
```rust
|
```rust
|
||||||
pub enum ClaimData {
|
pub enum ClaimData {
|
||||||
|
// Entity, Property, Lexeme Tables
|
||||||
Thing(Thing),
|
Thing(Thing),
|
||||||
ClaimValueData(ClaimValueData),
|
ClaimValueData(ClaimValueData),
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
#### [Docs for ClaimValueData](https://docs.rs/wikidata/0.3.1/wikidata/enum.ClaimValueData.html)
|
||||||
|
|
||||||
# Similar Projects
|
# Similar Projects
|
||||||
- [wd2duckdb](https://github.com/weso/wd2duckdb)
|
- [wd2duckdb](https://github.com/weso/wd2duckdb)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
```
|
```
|
||||||
let $number_of_episodes = (select claims.claims[where id = Property:1113][0].value.ClaimValueData.Quantity.amount as number_of_episodes from Entity where label = "Black Clover, season 1")[0].number_of_episodes;
|
let $number_of_episodes = (select claims.claims[where id = Property:1113][0].value.ClaimValueData.Quantity.amount as number_of_episodes from Entity where label = "Black Clover, season 1")[0].number_of_episodes;
|
||||||
|
|
||||||
return $number_of_episodes[0].number_of_episodes;
|
return $number_of_episodes;
|
||||||
|
|
||||||
update Entity SET number_of_episodes=$number_of_episodes where label = "Black Clover, season 1";
|
update Entity SET number_of_episodes=$number_of_episodes where label = "Black Clover, season 1";
|
||||||
```
|
```
|
||||||
|
|
|
@ -22,7 +22,7 @@ services:
|
||||||
|
|
||||||
wikidata-to-surrealdb:
|
wikidata-to-surrealdb:
|
||||||
container_name: wikidata-to-surrealdb
|
container_name: wikidata-to-surrealdb
|
||||||
image: ghcr.io/nexveridian/ark-invest-api-rust-data:latest
|
image: ghcr.io/nexveridian/wikidata-to-surrealdb:latest
|
||||||
env_file:
|
env_file:
|
||||||
- .env
|
- .env
|
||||||
restart: no
|
restart: no
|
||||||
|
|
2
src/lib.rs
Normal file
2
src/lib.rs
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
pub mod utils;
|
||||||
|
pub use utils::*;
|
115
src/main.rs
115
src/main.rs
|
@ -1,24 +1,8 @@
|
||||||
use anyhow::{Error, Ok, Result};
|
use anyhow::{Error, Ok, Result};
|
||||||
use bzip2::read::MultiBzDecoder;
|
|
||||||
use futures::future::join_all;
|
|
||||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use serde_json::{from_str, Value};
|
use std::{env, fmt::Write, io::BufRead, thread, time::Duration};
|
||||||
use std::{
|
use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal};
|
||||||
env,
|
|
||||||
fmt::Write,
|
|
||||||
fs::File,
|
|
||||||
io::{BufRead, BufReader},
|
|
||||||
thread,
|
|
||||||
time::Duration,
|
|
||||||
};
|
|
||||||
use surrealdb::{
|
|
||||||
engine::remote::ws::{Client, Ws},
|
|
||||||
opt::auth::Root,
|
|
||||||
Surreal,
|
|
||||||
};
|
|
||||||
use wikidata::Entity;
|
|
||||||
|
|
||||||
mod utils;
|
mod utils;
|
||||||
use utils::*;
|
use utils::*;
|
||||||
|
|
||||||
|
@ -32,71 +16,6 @@ lazy_static! {
|
||||||
static ref THREADED_REQUESTS: bool = env::var("THREADED_REQUESTS").expect("THREADED_REQUESTS not set").parse().expect("Failed to parse THREADED_REQUESTS");
|
static ref THREADED_REQUESTS: bool = env::var("THREADED_REQUESTS").expect("THREADED_REQUESTS not set").parse().expect("Failed to parse THREADED_REQUESTS");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(non_camel_case_types)]
|
|
||||||
enum File_Format {
|
|
||||||
json,
|
|
||||||
bz2,
|
|
||||||
}
|
|
||||||
impl File_Format {
|
|
||||||
fn new(file: &str) -> Self {
|
|
||||||
match file {
|
|
||||||
"json" => Self::json,
|
|
||||||
"bz2" => Self::bz2,
|
|
||||||
_ => panic!("Unknown file format"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn reader(self, file: &str) -> Result<Box<dyn BufRead>, Error> {
|
|
||||||
let file = File::open(file)?;
|
|
||||||
match self {
|
|
||||||
File_Format::json => Ok(Box::new(BufReader::new(file))),
|
|
||||||
File_Format::bz2 => Ok(Box::new(BufReader::new(MultiBzDecoder::new(file)))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn create_db_entity(db: &Surreal<Client>, line: String) -> Result<(), Error> {
|
|
||||||
let line = line.trim().trim_end_matches(',').to_string();
|
|
||||||
if line == "[" || line == "]" {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
let json: Value = from_str(&line)?;
|
|
||||||
let data = Entity::from_json(json).expect("Failed to parse JSON");
|
|
||||||
|
|
||||||
let (mut claims, mut data) = EntityMini::from_entity(data);
|
|
||||||
|
|
||||||
let id = data.id.clone().expect("No ID");
|
|
||||||
data.id = None;
|
|
||||||
let _ = db.create::<Option<EntityMini>>(&id).await.is_err();
|
|
||||||
{
|
|
||||||
db.update::<Option<EntityMini>>(&id).content(data).await?;
|
|
||||||
};
|
|
||||||
|
|
||||||
let id = claims.id.clone().expect("No ID");
|
|
||||||
claims.id = None;
|
|
||||||
let _ = db.create::<Option<Claims>>(&id).await.is_err();
|
|
||||||
{
|
|
||||||
db.update::<Option<Claims>>(&id).content(claims).await?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn create_db_entities(
|
|
||||||
db: &Surreal<Client>,
|
|
||||||
lines: Vec<String>,
|
|
||||||
pb: ProgressBar,
|
|
||||||
) -> Result<(), Error> {
|
|
||||||
let mut counter = 0;
|
|
||||||
for line in lines {
|
|
||||||
create_db_entity(db, line.to_string()).await?;
|
|
||||||
counter += 1;
|
|
||||||
if counter % 100 == 0 {
|
|
||||||
pb.inc(100);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<(), Error> {
|
async fn main() -> Result<(), Error> {
|
||||||
thread::sleep(Duration::from_secs(10));
|
thread::sleep(Duration::from_secs(10));
|
||||||
|
@ -136,35 +55,7 @@ async fn main() -> Result<(), Error> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let mut futures = Vec::new();
|
create_db_entities_threaded(&db, reader, Some(pb.clone()), 1000, 100).await?;
|
||||||
let mut chunk = Vec::new();
|
|
||||||
let mut chunk_counter = 0;
|
|
||||||
const BATCH_SIZE: usize = 1000;
|
|
||||||
const BATCH_NUM: usize = 100;
|
|
||||||
|
|
||||||
for line in reader.lines() {
|
|
||||||
chunk.push(line.unwrap());
|
|
||||||
|
|
||||||
if chunk.len() >= BATCH_SIZE {
|
|
||||||
let db = db.clone();
|
|
||||||
let lines = chunk.clone();
|
|
||||||
let pb = pb.clone();
|
|
||||||
|
|
||||||
futures.push(tokio::spawn(async move {
|
|
||||||
create_db_entities(&db, lines, pb).await.unwrap();
|
|
||||||
}));
|
|
||||||
chunk_counter += 1;
|
|
||||||
chunk.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
if chunk_counter >= BATCH_NUM {
|
|
||||||
join_all(futures).await;
|
|
||||||
futures = Vec::new();
|
|
||||||
chunk_counter = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
join_all(futures).await;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pb.finish();
|
pb.finish();
|
||||||
|
|
217
src/utils.rs
217
src/utils.rs
|
@ -1,138 +1,119 @@
|
||||||
use lazy_static::lazy_static;
|
use anyhow::{Error, Ok, Result};
|
||||||
use serde::{Deserialize, Serialize};
|
use bzip2::read::MultiBzDecoder;
|
||||||
use std::env;
|
use futures::future::join_all;
|
||||||
use surrealdb::sql::Thing;
|
use indicatif::ProgressBar;
|
||||||
use wikidata::{ClaimValue, ClaimValueData, Entity, Lang, Pid, WikiId};
|
use serde_json::{from_str, Value};
|
||||||
|
use std::{
|
||||||
|
fs::File,
|
||||||
|
io::{BufRead, BufReader},
|
||||||
|
};
|
||||||
|
use surrealdb::{Connection, Surreal};
|
||||||
|
use wikidata::Entity;
|
||||||
|
|
||||||
lazy_static! {
|
mod tables;
|
||||||
static ref WIKIDATA_LANG: String = env::var("WIKIDATA_LANG")
|
use tables::*;
|
||||||
.expect("WIKIDATA_LANG not set")
|
|
||||||
.to_string();
|
#[allow(non_camel_case_types)]
|
||||||
|
pub enum File_Format {
|
||||||
|
json,
|
||||||
|
bz2,
|
||||||
}
|
}
|
||||||
|
impl File_Format {
|
||||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
pub fn new(file: &str) -> Self {
|
||||||
pub enum ClaimData {
|
match file {
|
||||||
Thing(Thing),
|
"json" => Self::json,
|
||||||
ClaimValueData(ClaimValueData),
|
"bz2" => Self::bz2,
|
||||||
|
_ => panic!("Unknown file format"),
|
||||||
}
|
}
|
||||||
|
}
|
||||||
impl ClaimData {
|
pub fn reader(self, file: &str) -> Result<Box<dyn BufRead>, Error> {
|
||||||
fn from_cvd(cvd: ClaimValueData) -> Self {
|
let file = File::open(file)?;
|
||||||
match cvd {
|
match self {
|
||||||
ClaimValueData::Item(qid) => ClaimData::Thing(Thing {
|
File_Format::json => Ok(Box::new(BufReader::new(file))),
|
||||||
id: qid.0.into(),
|
File_Format::bz2 => Ok(Box::new(BufReader::new(MultiBzDecoder::new(file)))),
|
||||||
tb: "Entity".to_string(),
|
|
||||||
}),
|
|
||||||
ClaimValueData::Property(pid) => ClaimData::Thing(Thing {
|
|
||||||
id: pid.0.into(),
|
|
||||||
tb: "Property".to_string(),
|
|
||||||
}),
|
|
||||||
ClaimValueData::Lexeme(lid) => ClaimData::Thing(Thing {
|
|
||||||
id: lid.0.into(),
|
|
||||||
tb: "Lexeme".to_string(),
|
|
||||||
}),
|
|
||||||
_ => ClaimData::ClaimValueData(cvd),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
pub async fn create_db_entity<C: Connection>(db: &Surreal<C>, line: String) -> Result<(), Error> {
|
||||||
pub struct Claims {
|
let line = line.trim().trim_end_matches(',').to_string();
|
||||||
// Table: Claims
|
if line == "[" || line == "]" {
|
||||||
pub id: Option<Thing>,
|
return Ok(());
|
||||||
pub claims: Vec<Claim>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
let json: Value = from_str(&line)?;
|
||||||
pub struct Claim {
|
let data = Entity::from_json(json).expect("Failed to parse JSON");
|
||||||
// Table: Claim
|
|
||||||
pub id: Thing,
|
|
||||||
pub value: ClaimData,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
let (mut claims, mut data) = EntityMini::from_entity(data);
|
||||||
pub struct EntityMini {
|
|
||||||
// Table: Entity, Property, Lexeme
|
|
||||||
pub id: Option<Thing>,
|
|
||||||
pub label: String,
|
|
||||||
pub claims: Thing,
|
|
||||||
pub description: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl EntityMini {
|
let id = data.id.clone().expect("No ID");
|
||||||
pub fn from_entity(entity: Entity) -> (Claims, Self) {
|
data.id = None;
|
||||||
let thing_claim = Thing {
|
let _ = db.create::<Option<EntityMini>>(&id).await.is_err();
|
||||||
id: get_id_entity(&entity).id,
|
{
|
||||||
tb: "Claims".to_string(),
|
db.update::<Option<EntityMini>>(&id).content(data).await?;
|
||||||
};
|
};
|
||||||
|
|
||||||
(
|
let id = claims.id.clone().expect("No ID");
|
||||||
Claims {
|
claims.id = None;
|
||||||
id: Some(thing_claim.clone()),
|
let _ = db.create::<Option<Claims>>(&id).await.is_err();
|
||||||
..Self::flatten_claims(entity.claims.clone())
|
{
|
||||||
},
|
db.update::<Option<Claims>>(&id).content(claims).await?;
|
||||||
Self {
|
}
|
||||||
id: Some(get_id_entity(&entity)),
|
Ok(())
|
||||||
label: get_name(&entity),
|
|
||||||
claims: thing_claim,
|
|
||||||
description: get_description(&entity),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn flatten_claims(claims: Vec<(Pid, ClaimValue)>) -> Claims {
|
pub async fn create_db_entities<C: Connection>(
|
||||||
Claims {
|
db: &Surreal<C>,
|
||||||
id: None,
|
lines: Vec<String>,
|
||||||
claims: claims
|
pb: Option<ProgressBar>,
|
||||||
.iter()
|
) -> Result<(), Error> {
|
||||||
.flat_map(|(pid, claim_value)| {
|
let mut counter = 0;
|
||||||
let mut flattened = vec![Claim {
|
for line in lines {
|
||||||
id: Thing {
|
create_db_entity(db, line.to_string()).await?;
|
||||||
id: pid.0.into(),
|
counter += 1;
|
||||||
tb: "Property".to_string(),
|
if counter % 100 == 0 {
|
||||||
},
|
if let Some(ref p) = pb {
|
||||||
value: ClaimData::from_cvd(claim_value.data.clone()),
|
p.inc(100)
|
||||||
}];
|
|
||||||
|
|
||||||
flattened.extend(claim_value.qualifiers.iter().map(
|
|
||||||
|(qualifier_pid, qualifier_value)| Claim {
|
|
||||||
id: Thing {
|
|
||||||
id: qualifier_pid.0.into(),
|
|
||||||
tb: "Property".to_string(),
|
|
||||||
},
|
|
||||||
value: ClaimData::from_cvd(qualifier_value.clone()),
|
|
||||||
},
|
|
||||||
));
|
|
||||||
flattened
|
|
||||||
})
|
|
||||||
.collect(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn create_db_entities_threaded<C: Connection>(
|
||||||
|
db: &Surreal<C>,
|
||||||
|
reader: Box<dyn BufRead>,
|
||||||
|
pb: Option<ProgressBar>,
|
||||||
|
batch_size: usize,
|
||||||
|
batch_num: usize,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let mut futures = Vec::new();
|
||||||
|
let mut chunk = Vec::new();
|
||||||
|
let mut chunk_counter = 0;
|
||||||
|
|
||||||
|
for line in reader.lines() {
|
||||||
|
chunk.push(line.unwrap());
|
||||||
|
|
||||||
|
if chunk.len() >= batch_size {
|
||||||
|
let db = db.clone();
|
||||||
|
let lines = chunk.clone();
|
||||||
|
let pb = pb.clone();
|
||||||
|
|
||||||
|
futures.push(tokio::spawn(async move {
|
||||||
|
create_db_entities(&db, lines, pb).await.unwrap();
|
||||||
|
}));
|
||||||
|
chunk_counter += 1;
|
||||||
|
chunk.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
if chunk_counter >= batch_num {
|
||||||
|
join_all(futures).await;
|
||||||
|
futures = Vec::new();
|
||||||
|
chunk_counter = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn get_id_entity(entity: &Entity) -> Thing {
|
create_db_entities(db, chunk, pb).await.unwrap();
|
||||||
let (id, tb) = match entity.id {
|
join_all(futures).await;
|
||||||
WikiId::EntityId(qid) => (qid.0, "Entity".to_string()),
|
Ok(())
|
||||||
WikiId::PropertyId(pid) => (pid.0, "Property".to_string()),
|
|
||||||
WikiId::LexemeId(lid) => (lid.0, "Lexeme".to_string()),
|
|
||||||
_ => todo!("Not implemented"),
|
|
||||||
};
|
|
||||||
|
|
||||||
Thing { id: id.into(), tb }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_name(entity: &Entity) -> String {
|
|
||||||
entity
|
|
||||||
.labels
|
|
||||||
.get(&Lang(WIKIDATA_LANG.to_string()))
|
|
||||||
.map(|label| label.to_string())
|
|
||||||
.unwrap_or_default()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_description(entity: &Entity) -> String {
|
|
||||||
entity
|
|
||||||
.descriptions
|
|
||||||
.get(&Lang(WIKIDATA_LANG.to_string()))
|
|
||||||
.cloned()
|
|
||||||
.unwrap_or_default()
|
|
||||||
}
|
}
|
||||||
|
|
137
src/utils/tables.rs
Normal file
137
src/utils/tables.rs
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::env;
|
||||||
|
use surrealdb::sql::Thing;
|
||||||
|
use wikidata::{ClaimValue, ClaimValueData, Entity, Lang, Pid, WikiId};
|
||||||
|
|
||||||
|
lazy_static! {
    /// Language code used to select labels and descriptions.
    ///
    /// Read from the `WIKIDATA_LANG` environment variable the first time the
    /// static is dereferenced; that first access panics with
    /// "WIKIDATA_LANG not set" if `env::var` returns an error.
    // `env::var` already yields an owned `String`, so no further conversion
    // (and no extra allocation) is needed.
    static ref WIKIDATA_LANG: String = env::var("WIKIDATA_LANG").expect("WIKIDATA_LANG not set");
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||||
|
pub enum ClaimData {
|
||||||
|
Thing(Thing),
|
||||||
|
ClaimValueData(ClaimValueData),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ClaimData {
|
||||||
|
fn from_cvd(cvd: ClaimValueData) -> Self {
|
||||||
|
match cvd {
|
||||||
|
ClaimValueData::Item(qid) => ClaimData::Thing(Thing {
|
||||||
|
id: qid.0.into(),
|
||||||
|
tb: "Entity".to_string(),
|
||||||
|
}),
|
||||||
|
ClaimValueData::Property(pid) => ClaimData::Thing(Thing {
|
||||||
|
id: pid.0.into(),
|
||||||
|
tb: "Property".to_string(),
|
||||||
|
}),
|
||||||
|
ClaimValueData::Lexeme(lid) => ClaimData::Thing(Thing {
|
||||||
|
id: lid.0.into(),
|
||||||
|
tb: "Lexeme".to_string(),
|
||||||
|
}),
|
||||||
|
_ => ClaimData::ClaimValueData(cvd),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||||
|
pub struct Claims {
|
||||||
|
pub id: Option<Thing>,
|
||||||
|
pub claims: Vec<Claim>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||||
|
pub struct Claim {
|
||||||
|
pub id: Thing,
|
||||||
|
pub value: ClaimData,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||||
|
pub struct EntityMini {
|
||||||
|
// Table: Entity, Property, Lexeme
|
||||||
|
pub id: Option<Thing>,
|
||||||
|
pub label: String,
|
||||||
|
// Claims Table
|
||||||
|
pub claims: Thing,
|
||||||
|
pub description: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EntityMini {
|
||||||
|
pub fn from_entity(entity: Entity) -> (Claims, Self) {
|
||||||
|
let thing_claim = Thing {
|
||||||
|
id: get_id_entity(&entity).id,
|
||||||
|
tb: "Claims".to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
(
|
||||||
|
Claims {
|
||||||
|
id: Some(thing_claim.clone()),
|
||||||
|
..Self::flatten_claims(entity.claims.clone())
|
||||||
|
},
|
||||||
|
Self {
|
||||||
|
id: Some(get_id_entity(&entity)),
|
||||||
|
label: get_name(&entity),
|
||||||
|
claims: thing_claim,
|
||||||
|
description: get_description(&entity),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn flatten_claims(claims: Vec<(Pid, ClaimValue)>) -> Claims {
|
||||||
|
Claims {
|
||||||
|
id: None,
|
||||||
|
claims: claims
|
||||||
|
.iter()
|
||||||
|
.flat_map(|(pid, claim_value)| {
|
||||||
|
let mut flattened = vec![Claim {
|
||||||
|
id: Thing {
|
||||||
|
id: pid.0.into(),
|
||||||
|
tb: "Property".to_string(),
|
||||||
|
},
|
||||||
|
value: ClaimData::from_cvd(claim_value.data.clone()),
|
||||||
|
}];
|
||||||
|
|
||||||
|
flattened.extend(claim_value.qualifiers.iter().map(
|
||||||
|
|(qualifier_pid, qualifier_value)| Claim {
|
||||||
|
id: Thing {
|
||||||
|
id: qualifier_pid.0.into(),
|
||||||
|
tb: "Property".to_string(),
|
||||||
|
},
|
||||||
|
value: ClaimData::from_cvd(qualifier_value.clone()),
|
||||||
|
},
|
||||||
|
));
|
||||||
|
flattened
|
||||||
|
})
|
||||||
|
.collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_id_entity(entity: &Entity) -> Thing {
|
||||||
|
let (id, tb) = match entity.id {
|
||||||
|
WikiId::EntityId(qid) => (qid.0, "Entity".to_string()),
|
||||||
|
WikiId::PropertyId(pid) => (pid.0, "Property".to_string()),
|
||||||
|
WikiId::LexemeId(lid) => (lid.0, "Lexeme".to_string()),
|
||||||
|
_ => todo!("Not implemented"),
|
||||||
|
};
|
||||||
|
|
||||||
|
Thing { id: id.into(), tb }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_name(entity: &Entity) -> String {
|
||||||
|
entity
|
||||||
|
.labels
|
||||||
|
.get(&Lang(WIKIDATA_LANG.to_string()))
|
||||||
|
.map(|label| label.to_string())
|
||||||
|
.unwrap_or_default()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_description(entity: &Entity) -> String {
|
||||||
|
entity
|
||||||
|
.descriptions
|
||||||
|
.get(&Lang(WIKIDATA_LANG.to_string()))
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_default()
|
||||||
|
}
|
5
tests/data/Entity.json
Normal file
5
tests/data/Entity.json
Normal file
File diff suppressed because one or more lines are too long
4
tests/data/Property.json
Normal file
4
tests/data/Property.json
Normal file
File diff suppressed because one or more lines are too long
103
tests/integration.rs
Normal file
103
tests/integration.rs
Normal file
|
@ -0,0 +1,103 @@
|
||||||
|
use anyhow::{Error, Ok, Result};
|
||||||
|
use std::{env, io::BufRead};
|
||||||
|
use surrealdb::{
|
||||||
|
engine::local::{Db, Mem},
|
||||||
|
Surreal,
|
||||||
|
};
|
||||||
|
|
||||||
|
use wikidata_to_surrealdb::utils::*;
|
||||||
|
|
||||||
|
async fn inti_db() -> Result<Surreal<Db>, Error> {
|
||||||
|
env::set_var("WIKIDATA_LANG", "en");
|
||||||
|
|
||||||
|
let db = Surreal::new::<Mem>(()).await?;
|
||||||
|
db.use_ns("wikidata").use_db("wikidata").await?;
|
||||||
|
|
||||||
|
Ok(db)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Looks up the number of episodes (property 1113) recorded for the entity
/// labelled "Black Clover, season 1".
///
/// Returns `Ok(None)` when the query yields no value.
///
/// # Errors
///
/// Returns an error if the query fails or its first result cannot be
/// deserialized as `Option<f32>`.
async fn entity_query(db: &Surreal<Db>) -> Result<Option<f32>, Error> {
    // Propagate failures with `?` so the caller sees the underlying error
    // instead of a panic raised inside this helper.
    let x: Option<f32> = db
        .query(
            r#"
            return
            (
                select claims.claims[where id = Property:1113][0].value.ClaimValueData.Quantity.amount as number_of_episodes from Entity
                where label = "Black Clover, season 1"
            )[0].number_of_episodes;
            "#,
        )
        .await?
        .take(0)?;
    Ok(x)
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn entity() {
|
||||||
|
let db = inti_db().await.unwrap();
|
||||||
|
let reader = File_Format::new("json")
|
||||||
|
.reader("tests/data/Entity.json")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
for line in reader.lines() {
|
||||||
|
create_db_entity(&db, line.unwrap()).await.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn entity_threaded() {
|
||||||
|
let db = inti_db().await.unwrap();
|
||||||
|
let reader = File_Format::new("json")
|
||||||
|
.reader("tests/data/Entity.json")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
create_db_entities_threaded(&db, reader, None, 1000, 100)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Counts the records in the `Property` table.
///
/// Returns `Ok(None)` when the query yields no value.
///
/// # Errors
///
/// Returns an error if the query fails or its first result cannot be
/// deserialized as `Option<f32>`.
async fn property_query(db: &Surreal<Db>) -> Result<Option<f32>, Error> {
    // Propagate failures with `?` so the caller sees the underlying error
    // instead of a panic raised inside this helper.
    let x: Option<f32> = db
        .query(
            r#"
            return count(select * from Property);
            "#,
        )
        .await?
        .take(0)?;
    Ok(x)
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn property() {
|
||||||
|
let db = inti_db().await.unwrap();
|
||||||
|
let reader = File_Format::new("json")
|
||||||
|
.reader("tests/data/Property.json")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
for line in reader.lines() {
|
||||||
|
create_db_entity(&db, line.unwrap()).await.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn property_threaded() {
|
||||||
|
let db = inti_db().await.unwrap();
|
||||||
|
let reader = File_Format::new("json")
|
||||||
|
.reader("tests/data/Property.json")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
create_db_entities_threaded(&db, reader, None, 1000, 100)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue