Mirror of https://github.com/NexVeridian/wikidata-to-surrealdb.git, synced 2025-09-02 01:49:13 +00:00
tests
commit e37d413372 (parent 2edaeef042)
14 changed files with 525 additions and 250 deletions
.github/workflows/docker.yml (vendored, new file, 109 lines)
@@ -0,0 +1,109 @@
name: Docker

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

on:
  workflow_run:
    workflows: [nextest]
    branches: [main]
    types:
      - completed
  # schedule:
  #   - cron: 0 0 * * 1
  # push:
  #   branches: [ "main" ]
  #   # Publish semver tags as releases.
  #   tags: [ 'v*.*.*' ]
  # pull_request:
  #   branches: [ "main" ]

env:
  # Use docker.io for Docker Hub if empty
  REGISTRY: ghcr.io
  # github.repository as <account>/<repo>
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build:
    runs-on: ubuntu-latest
    # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#running-a-workflow-based-on-the-conclusion-of-another-workflow
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    permissions:
      contents: read
      packages: write
      # This is used to complete the identity challenge
      # with sigstore/fulcio when running outside of PRs.
      id-token: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      # Install the cosign tool except on PR
      # https://github.com/sigstore/cosign-installer
      - name: Install cosign
        if: github.event_name != 'pull_request'
        uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 #v3.1.1
        with:
          cosign-release: "v2.1.1"

      # Set up BuildKit Docker container builder to be able to build
      # multi-platform images and export cache
      # https://github.com/docker/setup-buildx-action
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@f95db51fddba0c2d1ec667646a06c2ce06100226 # v3.0.0

      # Login against a Docker registry except on PR
      # https://github.com/docker/login-action
      - name: Log into registry ${{ env.REGISTRY }}
        if: github.event_name != 'pull_request'
        uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GHCR_TOKEN }}

      # Extract metadata (tags, labels) for Docker
      # https://github.com/docker/metadata-action
      - name: Extract Docker metadata
        id: meta
        uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 # v5.0.0
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      # https://github.com/orgs/community/discussions/25768#discussioncomment-3249183
      - name: downcase REPO
        run: |
          echo "REPO=${GITHUB_REPOSITORY,,}" >>${GITHUB_ENV}

      # Build and push Docker image with Buildx (don't push on PR)
      # https://github.com/docker/build-push-action
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 # v5.0.0
        with:
          context: .
          push: ${{ github.event_name != 'pull_request' }}
          # tags: ${{ steps.meta.outputs.tags }}
          # tags: ${{ env.REGISTRY }}/${{ github.repository }}:latest
          tags: ${{ env.REGISTRY }}/${{ env.REPO }}:latest
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

      # # Sign the resulting Docker image digest except on PRs.
      # # This will only write to the public Rekor transparency log when the Docker
      # # repository is public to avoid leaking data. If you would like to publish
      # # transparency data even for private images, pass --force to cosign below.
      # # https://github.com/sigstore/cosign
      # - name: Sign the published Docker image
      #   if: ${{ github.event_name != 'pull_request' }}
      #   env:
      #     # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
      #     TAGS: ${{ steps.meta.outputs.tags }}
      #     DIGEST: ${{ steps.build-and-push.outputs.digest }}
      #   # This step uses the identity token to provision an ephemeral certificate
      #   # against the sigstore community Fulcio instance.
      #   run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST}
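The one step that is easy to misread is `downcase REPO`: GHCR rejects image references containing uppercase letters, while `${{ github.repository }}` keeps the owner's casing (`NexVeridian/...`), so the bash parameter expansion `${GITHUB_REPOSITORY,,}` lowercases the whole slug before it is used as the image tag. The same transformation in Rust, purely as an illustration (not part of the commit):

```rust
fn main() {
    // What the bash expansion ${GITHUB_REPOSITORY,,} does to the repo slug.
    let repo = "NexVeridian/wikidata-to-surrealdb".to_lowercase();
    assert_eq!(repo, "nexveridian/wikidata-to-surrealdb");
    println!("ghcr.io/{repo}:latest");
}
```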
.github/workflows/nextest.yml (vendored, new file, 45 lines)
@@ -0,0 +1,45 @@
# https://github.com/nextest-rs/reuse-build-partition-example
# https://keliris.dev/articles/setup-rust-github-actions

name: nextest

on:
  pull_request:
    branches: [main]
  push:
    branches: [main]
  schedule:
    - cron: 0 0 * * 1

env:
  CARGO_TERM_COLOR: always

jobs:
  run-tests:
    name: run tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      # - uses: rui314/setup-mold@v1
      - name: install mold
        run: sudo apt-get install -y musl-tools musl-dev libssl-dev clang mold
      # https://github.com/moonrepo/setup-rust
      - uses: moonrepo/setup-rust@v1
        with:
          bins: cargo-nextest
      - name: Run tests
        run: cargo nextest run -E "all() - test(get_api) - kind(bin)"

  clippy:
    name: clippy
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      # - uses: rui314/setup-mold@v1
      - name: install mold
        run: sudo apt-get install -y musl-tools musl-dev libssl-dev clang mold
      - uses: moonrepo/setup-rust@v1
        with:
          components: clippy
      - name: clippy
        run: cargo clippy
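Note the filter expression on the test step: `-E "all() - test(get_api) - kind(bin)"` runs everything except tests whose names contain `get_api` and tests in binary targets, keeping network-bound tests out of CI. A hypothetical example of a test this filter would skip (illustrative only; no such test is added in this commit):

```rust
// The nextest filter `test(get_api)` matches on the test name, so a
// network-dependent test named like this never runs in CI.
#[tokio::test]
async fn get_api_entity() {
    // would fetch an entity from the live Wikidata API and assert on it
}
```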
Cargo.toml

@@ -15,3 +15,6 @@ wikidata = "0.3.1"
 bzip2 = { version = "0.4", features = ["tokio"] }
 lazy_static = "1.4"
 indicatif = "0.17"
+
+[dev-dependencies]
+surrealdb = { version = "1.0", features = ["kv-mem"] }
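The new `[dev-dependencies]` entry pulls in `surrealdb` with the `kv-mem` feature, which lets the tests run against an embedded in-memory datastore instead of a WebSocket connection to a running server. A minimal sketch of what the feature enables (the real usage is in tests/integration.rs below):

```rust
use surrealdb::{engine::local::Mem, Surreal};

#[tokio::main]
async fn main() -> surrealdb::Result<()> {
    // kv-mem: an embedded SurrealDB that lives and dies with the process.
    let db = Surreal::new::<Mem>(()).await?;
    db.use_ns("wikidata").use_db("wikidata").await?;
    Ok(())
}
```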
README.md (23 changed lines)
@@ -1,5 +1,5 @@
 # Wikidata to SurrealDB
-A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database. Either From a bz2 or json file format.
+A tool for converting Wikidata dumps to a [SurrealDB](https://surrealdb.com/) database. Either from a bz2 or json file.

 # Getting The Data
 https://www.wikidata.org/wiki/Wikidata:Data_access
@@ -50,20 +50,21 @@ THREADED_REQUESTS=true
 # How to Query
 ## See [Useful queries.md](./Useful%20queries.md)

-# Table Layout
-## Thing
+# Table Schema
+## SurrealDB Thing
 ```rust
 pub struct Thing {
     pub table: String,
-    pub id: Id,
+    pub id: Id, // i64
 }
 ```

-## Table: Entity, Property, Lexeme
+## Tables: Entity, Property, Lexeme
 ```rust
 pub struct EntityMini {
     pub id: Option<Thing>,
     pub label: String,
+    // Claims Table
     pub claims: Thing,
     pub description: String,
 }
@@ -71,27 +72,21 @@ pub struct EntityMini {
 ```

 ## Table: Claims
 ```rust
 pub struct Claims {
     pub id: Option<Thing>,
     pub claims: Vec<Claim>,
 }
 ```

 ## Table: Claim
 ```rust
 pub struct Claim {
     pub id: Thing,
     pub value: ClaimData,
 }
 ```

-## ClaimData
+### ClaimData
 ```rust
 pub enum ClaimData {
     // Entity, Property, Lexeme Tables
     Thing(Thing),
     ClaimValueData(ClaimValueData),
 }
 ```
 #### [Docs for ClaimValueData](https://docs.rs/wikidata/0.3.1/wikidata/enum.ClaimValueData.html)

 # Similar Projects
 - [wd2duckdb](https://github.com/weso/wd2duckdb)
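To make the schema concrete: each Wikidata record becomes one `EntityMini` row whose `claims` field is a record link to a parallel row in the Claims table. A sketch for a hypothetical item Q42, assuming `EntityMini` is in scope (values invented for illustration):

```rust
use surrealdb::sql::Thing;

fn example_q42() -> EntityMini {
    // Hypothetical layout for item Q42, using the EntityMini struct shown above.
    EntityMini {
        id: Some(Thing { tb: "Entity".to_string(), id: 42i64.into() }),
        label: "Douglas Adams".to_string(),
        // record link to the matching row in the Claims table
        claims: Thing { tb: "Claims".to_string(), id: 42i64.into() },
        description: "English author".to_string(),
    }
}
```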
Useful queries.md

@@ -2,7 +2,7 @@
 ```
 let $number_of_episodes = (select claims.claims[where id = Property:1113][0].value.ClaimValueData.Quantity.amount as number_of_episodes from Entity where label = "Black Clover, season 1")[0].number_of_episodes;

-return $number_of_episodes[0].number_of_episodes;
+return $number_of_episodes;

 update Entity SET number_of_episodes=$number_of_episodes where label = "Black Clover, season 1";
 ```
docker-compose.yml

@@ -22,7 +22,7 @@ services:

   wikidata-to-surrealdb:
     container_name: wikidata-to-surrealdb
-    image: ghcr.io/nexveridian/ark-invest-api-rust-data:latest
+    image: ghcr.io/nexveridian/wikidata-to-surrealdb:latest
     env_file:
       - .env
     restart: no
src/lib.rs (new file, 2 lines)
@@ -0,0 +1,2 @@
pub mod utils;
pub use utils::*;
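These two lines are what make the new integration tests possible: code under tests/ can only import library targets, not binaries, so the crate now also builds as a library that exposes `utils`. The tests then import it by crate name, exactly as tests/integration.rs does below:

```rust
use wikidata_to_surrealdb::utils::*;
```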
src/main.rs (115 changed lines)
@@ -1,24 +1,8 @@
 use anyhow::{Error, Ok, Result};
-use bzip2::read::MultiBzDecoder;
-use futures::future::join_all;
 use indicatif::{ProgressBar, ProgressState, ProgressStyle};
 use lazy_static::lazy_static;
-use serde_json::{from_str, Value};
-use std::{
-    env,
-    fmt::Write,
-    fs::File,
-    io::{BufRead, BufReader},
-    thread,
-    time::Duration,
-};
-use surrealdb::{
-    engine::remote::ws::{Client, Ws},
-    opt::auth::Root,
-    Surreal,
-};
-use wikidata::Entity;
+use std::{env, fmt::Write, io::BufRead, thread, time::Duration};
+use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal};
+mod utils;
+use utils::*;

@@ -32,71 +16,6 @@ lazy_static! {
     static ref THREADED_REQUESTS: bool = env::var("THREADED_REQUESTS").expect("THREADED_REQUESTS not set").parse().expect("Failed to parse THREADED_REQUESTS");
 }

-#[allow(non_camel_case_types)]
-enum File_Format {
-    json,
-    bz2,
-}
-impl File_Format {
-    fn new(file: &str) -> Self {
-        match file {
-            "json" => Self::json,
-            "bz2" => Self::bz2,
-            _ => panic!("Unknown file format"),
-        }
-    }
-    fn reader(self, file: &str) -> Result<Box<dyn BufRead>, Error> {
-        let file = File::open(file)?;
-        match self {
-            File_Format::json => Ok(Box::new(BufReader::new(file))),
-            File_Format::bz2 => Ok(Box::new(BufReader::new(MultiBzDecoder::new(file)))),
-        }
-    }
-}
-
-async fn create_db_entity(db: &Surreal<Client>, line: String) -> Result<(), Error> {
-    let line = line.trim().trim_end_matches(',').to_string();
-    if line == "[" || line == "]" {
-        return Ok(());
-    }
-
-    let json: Value = from_str(&line)?;
-    let data = Entity::from_json(json).expect("Failed to parse JSON");
-
-    let (mut claims, mut data) = EntityMini::from_entity(data);
-
-    let id = data.id.clone().expect("No ID");
-    data.id = None;
-    let _ = db.create::<Option<EntityMini>>(&id).await.is_err();
-    {
-        db.update::<Option<EntityMini>>(&id).content(data).await?;
-    };
-
-    let id = claims.id.clone().expect("No ID");
-    claims.id = None;
-    let _ = db.create::<Option<Claims>>(&id).await.is_err();
-    {
-        db.update::<Option<Claims>>(&id).content(claims).await?;
-    }
-    Ok(())
-}
-
-async fn create_db_entities(
-    db: &Surreal<Client>,
-    lines: Vec<String>,
-    pb: ProgressBar,
-) -> Result<(), Error> {
-    let mut counter = 0;
-    for line in lines {
-        create_db_entity(db, line.to_string()).await?;
-        counter += 1;
-        if counter % 100 == 0 {
-            pb.inc(100);
-        }
-    }
-    Ok(())
-}
-
 #[tokio::main]
 async fn main() -> Result<(), Error> {
     thread::sleep(Duration::from_secs(10));

@@ -136,35 +55,7 @@ async fn main() -> Result<(), Error> {
             }
         }
     } else {
-        let mut futures = Vec::new();
-        let mut chunk = Vec::new();
-        let mut chunk_counter = 0;
-        const BATCH_SIZE: usize = 1000;
-        const BATCH_NUM: usize = 100;
-
-        for line in reader.lines() {
-            chunk.push(line.unwrap());
-
-            if chunk.len() >= BATCH_SIZE {
-                let db = db.clone();
-                let lines = chunk.clone();
-                let pb = pb.clone();
-
-                futures.push(tokio::spawn(async move {
-                    create_db_entities(&db, lines, pb).await.unwrap();
-                }));
-                chunk_counter += 1;
-                chunk.clear();
-            }
-
-            if chunk_counter >= BATCH_NUM {
-                join_all(futures).await;
-                futures = Vec::new();
-                chunk_counter = 0;
-            }
-        }
-
-        join_all(futures).await;
+        create_db_entities_threaded(&db, reader, Some(pb.clone()), 1000, 100).await?;
     }

     pb.finish();
src/utils.rs (215 changed lines)
@@ -1,138 +1,119 @@
-use lazy_static::lazy_static;
-use serde::{Deserialize, Serialize};
-use std::env;
-use surrealdb::sql::Thing;
-use wikidata::{ClaimValue, ClaimValueData, Entity, Lang, Pid, WikiId};
-
-lazy_static! {
-    static ref WIKIDATA_LANG: String = env::var("WIKIDATA_LANG")
-        .expect("WIKIDATA_LANG not set")
-        .to_string();
-}
-
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-pub enum ClaimData {
-    Thing(Thing),
-    ClaimValueData(ClaimValueData),
-}
-
-impl ClaimData {
-    fn from_cvd(cvd: ClaimValueData) -> Self {
-        match cvd {
-            ClaimValueData::Item(qid) => ClaimData::Thing(Thing {
-                id: qid.0.into(),
-                tb: "Entity".to_string(),
-            }),
-            ClaimValueData::Property(pid) => ClaimData::Thing(Thing {
-                id: pid.0.into(),
-                tb: "Property".to_string(),
-            }),
-            ClaimValueData::Lexeme(lid) => ClaimData::Thing(Thing {
-                id: lid.0.into(),
-                tb: "Lexeme".to_string(),
-            }),
-            _ => ClaimData::ClaimValueData(cvd),
-        }
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-pub struct Claims {
-    // Table: Claims
-    pub id: Option<Thing>,
-    pub claims: Vec<Claim>,
-}
-
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-pub struct Claim {
-    // Table: Claim
-    pub id: Thing,
-    pub value: ClaimData,
-}
-
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-pub struct EntityMini {
-    // Table: Entity, Property, Lexeme
-    pub id: Option<Thing>,
-    pub label: String,
-    pub claims: Thing,
-    pub description: String,
-}
-
-impl EntityMini {
-    pub fn from_entity(entity: Entity) -> (Claims, Self) {
-        let thing_claim = Thing {
-            id: get_id_entity(&entity).id,
-            tb: "Claims".to_string(),
-        };
-
-        (
-            Claims {
-                id: Some(thing_claim.clone()),
-                ..Self::flatten_claims(entity.claims.clone())
-            },
-            Self {
-                id: Some(get_id_entity(&entity)),
-                label: get_name(&entity),
-                claims: thing_claim,
-                description: get_description(&entity),
-            },
-        )
-    }
-
-    fn flatten_claims(claims: Vec<(Pid, ClaimValue)>) -> Claims {
-        Claims {
-            id: None,
-            claims: claims
-                .iter()
-                .flat_map(|(pid, claim_value)| {
-                    let mut flattened = vec![Claim {
-                        id: Thing {
-                            id: pid.0.into(),
-                            tb: "Property".to_string(),
-                        },
-                        value: ClaimData::from_cvd(claim_value.data.clone()),
-                    }];
-
-                    flattened.extend(claim_value.qualifiers.iter().map(
-                        |(qualifier_pid, qualifier_value)| Claim {
-                            id: Thing {
-                                id: qualifier_pid.0.into(),
-                                tb: "Property".to_string(),
-                            },
-                            value: ClaimData::from_cvd(qualifier_value.clone()),
-                        },
-                    ));
-                    flattened
-                })
-                .collect(),
-        }
-    }
-}
-
-fn get_id_entity(entity: &Entity) -> Thing {
-    let (id, tb) = match entity.id {
-        WikiId::EntityId(qid) => (qid.0, "Entity".to_string()),
-        WikiId::PropertyId(pid) => (pid.0, "Property".to_string()),
-        WikiId::LexemeId(lid) => (lid.0, "Lexeme".to_string()),
-        _ => todo!("Not implemented"),
-    };
-
-    Thing { id: id.into(), tb }
-}
-
-fn get_name(entity: &Entity) -> String {
-    entity
-        .labels
-        .get(&Lang(WIKIDATA_LANG.to_string()))
-        .map(|label| label.to_string())
-        .unwrap_or_default()
-}
-
-fn get_description(entity: &Entity) -> String {
-    entity
-        .descriptions
-        .get(&Lang(WIKIDATA_LANG.to_string()))
-        .cloned()
-        .unwrap_or_default()
-}
+use anyhow::{Error, Ok, Result};
+use bzip2::read::MultiBzDecoder;
+use futures::future::join_all;
+use indicatif::ProgressBar;
+use serde_json::{from_str, Value};
+use std::{
+    fs::File,
+    io::{BufRead, BufReader},
+};
+use surrealdb::{Connection, Surreal};
+use wikidata::Entity;
+
+mod tables;
+use tables::*;
+
+#[allow(non_camel_case_types)]
+pub enum File_Format {
+    json,
+    bz2,
+}
+
+impl File_Format {
+    pub fn new(file: &str) -> Self {
+        match file {
+            "json" => Self::json,
+            "bz2" => Self::bz2,
+            _ => panic!("Unknown file format"),
+        }
+    }
+    pub fn reader(self, file: &str) -> Result<Box<dyn BufRead>, Error> {
+        let file = File::open(file)?;
+        match self {
+            File_Format::json => Ok(Box::new(BufReader::new(file))),
+            File_Format::bz2 => Ok(Box::new(BufReader::new(MultiBzDecoder::new(file)))),
+        }
+    }
+}
+
+pub async fn create_db_entity<C: Connection>(db: &Surreal<C>, line: String) -> Result<(), Error> {
+    let line = line.trim().trim_end_matches(',').to_string();
+    if line == "[" || line == "]" {
+        return Ok(());
+    }
+
+    let json: Value = from_str(&line)?;
+    let data = Entity::from_json(json).expect("Failed to parse JSON");
+
+    let (mut claims, mut data) = EntityMini::from_entity(data);
+
+    let id = data.id.clone().expect("No ID");
+    data.id = None;
+    let _ = db.create::<Option<EntityMini>>(&id).await.is_err();
+    {
+        db.update::<Option<EntityMini>>(&id).content(data).await?;
+    };
+
+    let id = claims.id.clone().expect("No ID");
+    claims.id = None;
+    let _ = db.create::<Option<Claims>>(&id).await.is_err();
+    {
+        db.update::<Option<Claims>>(&id).content(claims).await?;
+    }
+    Ok(())
+}
+
+pub async fn create_db_entities<C: Connection>(
+    db: &Surreal<C>,
+    lines: Vec<String>,
+    pb: Option<ProgressBar>,
+) -> Result<(), Error> {
+    let mut counter = 0;
+    for line in lines {
+        create_db_entity(db, line.to_string()).await?;
+        counter += 1;
+        if counter % 100 == 0 {
+            if let Some(ref p) = pb {
+                p.inc(100)
+            }
+        }
+    }
+    Ok(())
+}
+
+pub async fn create_db_entities_threaded<C: Connection>(
+    db: &Surreal<C>,
+    reader: Box<dyn BufRead>,
+    pb: Option<ProgressBar>,
+    batch_size: usize,
+    batch_num: usize,
+) -> Result<(), Error> {
+    let mut futures = Vec::new();
+    let mut chunk = Vec::new();
+    let mut chunk_counter = 0;
+
+    for line in reader.lines() {
+        chunk.push(line.unwrap());
+
+        if chunk.len() >= batch_size {
+            let db = db.clone();
+            let lines = chunk.clone();
+            let pb = pb.clone();
+
+            futures.push(tokio::spawn(async move {
+                create_db_entities(&db, lines, pb).await.unwrap();
+            }));
+            chunk_counter += 1;
+            chunk.clear();
+        }
+
+        if chunk_counter >= batch_num {
+            join_all(futures).await;
+            futures = Vec::new();
+            chunk_counter = 0;
+        }
+    }
+
+    create_db_entities(db, chunk, pb).await.unwrap();
+    join_all(futures).await;
+    Ok(())
+}
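The refactor turns the ingestion loop into a reusable, connection-generic API: `create_db_entities_threaded` spawns one tokio task per batch of `batch_size` lines and awaits them in groups of `batch_num`, then flushes the final partial chunk. A usage sketch under assumed conditions (hypothetical dump path; in-memory engine; batch sizes mirror the defaults in main.rs):

```rust
use surrealdb::{engine::local::Mem, Surreal};
use wikidata_to_surrealdb::utils::*;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // tables.rs reads the label language from this variable.
    std::env::set_var("WIKIDATA_LANG", "en");

    let db = Surreal::new::<Mem>(()).await?;
    db.use_ns("wikidata").use_db("wikidata").await?;

    // Hypothetical dump path; 1000 lines per task, joined every 100 tasks.
    let reader = File_Format::new("bz2").reader("data/latest-all.json.bz2")?;
    create_db_entities_threaded(&db, reader, None, 1000, 100).await?;
    Ok(())
}
```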
src/utils/tables.rs (new file, 137 lines)
@@ -0,0 +1,137 @@
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use std::env;
use surrealdb::sql::Thing;
use wikidata::{ClaimValue, ClaimValueData, Entity, Lang, Pid, WikiId};

lazy_static! {
    static ref WIKIDATA_LANG: String = env::var("WIKIDATA_LANG")
        .expect("WIKIDATA_LANG not set")
        .to_string();
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum ClaimData {
    Thing(Thing),
    ClaimValueData(ClaimValueData),
}

impl ClaimData {
    fn from_cvd(cvd: ClaimValueData) -> Self {
        match cvd {
            ClaimValueData::Item(qid) => ClaimData::Thing(Thing {
                id: qid.0.into(),
                tb: "Entity".to_string(),
            }),
            ClaimValueData::Property(pid) => ClaimData::Thing(Thing {
                id: pid.0.into(),
                tb: "Property".to_string(),
            }),
            ClaimValueData::Lexeme(lid) => ClaimData::Thing(Thing {
                id: lid.0.into(),
                tb: "Lexeme".to_string(),
            }),
            _ => ClaimData::ClaimValueData(cvd),
        }
    }
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Claims {
    pub id: Option<Thing>,
    pub claims: Vec<Claim>,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Claim {
    pub id: Thing,
    pub value: ClaimData,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct EntityMini {
    // Table: Entity, Property, Lexeme
    pub id: Option<Thing>,
    pub label: String,
    // Claims Table
    pub claims: Thing,
    pub description: String,
}

impl EntityMini {
    pub fn from_entity(entity: Entity) -> (Claims, Self) {
        let thing_claim = Thing {
            id: get_id_entity(&entity).id,
            tb: "Claims".to_string(),
        };

        (
            Claims {
                id: Some(thing_claim.clone()),
                ..Self::flatten_claims(entity.claims.clone())
            },
            Self {
                id: Some(get_id_entity(&entity)),
                label: get_name(&entity),
                claims: thing_claim,
                description: get_description(&entity),
            },
        )
    }

    fn flatten_claims(claims: Vec<(Pid, ClaimValue)>) -> Claims {
        Claims {
            id: None,
            claims: claims
                .iter()
                .flat_map(|(pid, claim_value)| {
                    let mut flattened = vec![Claim {
                        id: Thing {
                            id: pid.0.into(),
                            tb: "Property".to_string(),
                        },
                        value: ClaimData::from_cvd(claim_value.data.clone()),
                    }];

                    flattened.extend(claim_value.qualifiers.iter().map(
                        |(qualifier_pid, qualifier_value)| Claim {
                            id: Thing {
                                id: qualifier_pid.0.into(),
                                tb: "Property".to_string(),
                            },
                            value: ClaimData::from_cvd(qualifier_value.clone()),
                        },
                    ));
                    flattened
                })
                .collect(),
        }
    }
}

fn get_id_entity(entity: &Entity) -> Thing {
    let (id, tb) = match entity.id {
        WikiId::EntityId(qid) => (qid.0, "Entity".to_string()),
        WikiId::PropertyId(pid) => (pid.0, "Property".to_string()),
        WikiId::LexemeId(lid) => (lid.0, "Lexeme".to_string()),
        _ => todo!("Not implemented"),
    };

    Thing { id: id.into(), tb }
}

fn get_name(entity: &Entity) -> String {
    entity
        .labels
        .get(&Lang(WIKIDATA_LANG.to_string()))
        .map(|label| label.to_string())
        .unwrap_or_default()
}

fn get_description(entity: &Entity) -> String {
    entity
        .descriptions
        .get(&Lang(WIKIDATA_LANG.to_string()))
        .cloned()
        .unwrap_or_default()
}
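The heart of the mapping is `ClaimData::from_cvd`: claims pointing at other Wikidata records (items, properties, lexemes) are rewritten into SurrealDB record links, while scalar values stay embedded as `ClaimValueData`. A unit-test sketch that could sit in this module (not part of the commit):

```rust
#[cfg(test)]
mod tests {
    use super::*;
    use wikidata::Qid;

    // An Item claim becomes a record link into the Entity table,
    // so claims can be traversed like graph edges in SurrealQL.
    #[test]
    fn item_claims_become_record_links() {
        match ClaimData::from_cvd(ClaimValueData::Item(Qid(42))) {
            ClaimData::Thing(thing) => assert_eq!(thing.tb, "Entity"),
            _ => panic!("expected a record link"),
        }
    }
}
```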
tests/data/Entity.json (new file, 5 lines)
File diff suppressed because one or more lines are too long
tests/data/Property.json (new file, 4 lines)
File diff suppressed because one or more lines are too long
tests/integration.rs (new file, 103 lines)
@@ -0,0 +1,103 @@
use anyhow::{Error, Ok, Result};
use std::{env, io::BufRead};
use surrealdb::{
    engine::local::{Db, Mem},
    Surreal,
};

use wikidata_to_surrealdb::utils::*;

async fn inti_db() -> Result<Surreal<Db>, Error> {
    env::set_var("WIKIDATA_LANG", "en");

    let db = Surreal::new::<Mem>(()).await?;
    db.use_ns("wikidata").use_db("wikidata").await?;

    Ok(db)
}

async fn entity_query(db: &Surreal<Db>) -> Result<Option<f32>, Error> {
    let x: Option<f32> = db
        .query(r#"
            return
            (
                select claims.claims[where id = Property:1113][0].value.ClaimValueData.Quantity.amount as number_of_episodes from Entity
                where label = "Black Clover, season 1"
            )[0].number_of_episodes;
        "#)
        .await
        .unwrap()
        .take(0)
        .unwrap();
    Ok(x)
}

#[tokio::test]
async fn entity() {
    let db = inti_db().await.unwrap();
    let reader = File_Format::new("json")
        .reader("tests/data/Entity.json")
        .unwrap();

    for line in reader.lines() {
        create_db_entity(&db, line.unwrap()).await.unwrap();
    }

    assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())
}

#[tokio::test]
async fn entity_threaded() {
    let db = inti_db().await.unwrap();
    let reader = File_Format::new("json")
        .reader("tests/data/Entity.json")
        .unwrap();

    create_db_entities_threaded(&db, reader, None, 1000, 100)
        .await
        .unwrap();

    assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())
}

async fn property_query(db: &Surreal<Db>) -> Result<Option<f32>, Error> {
    let x: Option<f32> = db
        .query(
            r#"
            return count(select * from Property);
            "#,
        )
        .await
        .unwrap()
        .take(0)
        .unwrap();
    Ok(x)
}

#[tokio::test]
async fn property() {
    let db = inti_db().await.unwrap();
    let reader = File_Format::new("json")
        .reader("tests/data/Property.json")
        .unwrap();

    for line in reader.lines() {
        create_db_entity(&db, line.unwrap()).await.unwrap();
    }

    assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())
}

#[tokio::test]
async fn property_threaded() {
    let db = inti_db().await.unwrap();
    let reader = File_Format::new("json")
        .reader("tests/data/Property.json")
        .unwrap();

    create_db_entities_threaded(&db, reader, None, 1000, 100)
        .await
        .unwrap();

    assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())
}