mirror of
https://github.com/NexVeridian/wikidata-to-surrealdb.git
synced 2025-09-02 09:59:13 +00:00
tests
This commit is contained in:
parent
2edaeef042
commit
e37d413372
14 changed files with 525 additions and 250 deletions
115
src/main.rs
115
src/main.rs
|
@ -1,24 +1,8 @@
|
|||
use anyhow::{Error, Ok, Result};
|
||||
use bzip2::read::MultiBzDecoder;
|
||||
use futures::future::join_all;
|
||||
use indicatif::{ProgressBar, ProgressState, ProgressStyle};
|
||||
use lazy_static::lazy_static;
|
||||
use serde_json::{from_str, Value};
|
||||
use std::{
|
||||
env,
|
||||
fmt::Write,
|
||||
fs::File,
|
||||
io::{BufRead, BufReader},
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
use surrealdb::{
|
||||
engine::remote::ws::{Client, Ws},
|
||||
opt::auth::Root,
|
||||
Surreal,
|
||||
};
|
||||
use wikidata::Entity;
|
||||
|
||||
use std::{env, fmt::Write, io::BufRead, thread, time::Duration};
|
||||
use surrealdb::{engine::remote::ws::Ws, opt::auth::Root, Surreal};
|
||||
mod utils;
|
||||
use utils::*;
|
||||
|
||||
|
@ -32,71 +16,6 @@ lazy_static! {
|
|||
static ref THREADED_REQUESTS: bool = env::var("THREADED_REQUESTS").expect("THREADED_REQUESTS not set").parse().expect("Failed to parse THREADED_REQUESTS");
|
||||
}
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
enum File_Format {
|
||||
json,
|
||||
bz2,
|
||||
}
|
||||
impl File_Format {
|
||||
fn new(file: &str) -> Self {
|
||||
match file {
|
||||
"json" => Self::json,
|
||||
"bz2" => Self::bz2,
|
||||
_ => panic!("Unknown file format"),
|
||||
}
|
||||
}
|
||||
fn reader(self, file: &str) -> Result<Box<dyn BufRead>, Error> {
|
||||
let file = File::open(file)?;
|
||||
match self {
|
||||
File_Format::json => Ok(Box::new(BufReader::new(file))),
|
||||
File_Format::bz2 => Ok(Box::new(BufReader::new(MultiBzDecoder::new(file)))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_db_entity(db: &Surreal<Client>, line: String) -> Result<(), Error> {
|
||||
let line = line.trim().trim_end_matches(',').to_string();
|
||||
if line == "[" || line == "]" {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let json: Value = from_str(&line)?;
|
||||
let data = Entity::from_json(json).expect("Failed to parse JSON");
|
||||
|
||||
let (mut claims, mut data) = EntityMini::from_entity(data);
|
||||
|
||||
let id = data.id.clone().expect("No ID");
|
||||
data.id = None;
|
||||
let _ = db.create::<Option<EntityMini>>(&id).await.is_err();
|
||||
{
|
||||
db.update::<Option<EntityMini>>(&id).content(data).await?;
|
||||
};
|
||||
|
||||
let id = claims.id.clone().expect("No ID");
|
||||
claims.id = None;
|
||||
let _ = db.create::<Option<Claims>>(&id).await.is_err();
|
||||
{
|
||||
db.update::<Option<Claims>>(&id).content(claims).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_db_entities(
|
||||
db: &Surreal<Client>,
|
||||
lines: Vec<String>,
|
||||
pb: ProgressBar,
|
||||
) -> Result<(), Error> {
|
||||
let mut counter = 0;
|
||||
for line in lines {
|
||||
create_db_entity(db, line.to_string()).await?;
|
||||
counter += 1;
|
||||
if counter % 100 == 0 {
|
||||
pb.inc(100);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Error> {
|
||||
thread::sleep(Duration::from_secs(10));
|
||||
|
@ -136,35 +55,7 @@ async fn main() -> Result<(), Error> {
|
|||
}
|
||||
}
|
||||
} else {
|
||||
let mut futures = Vec::new();
|
||||
let mut chunk = Vec::new();
|
||||
let mut chunk_counter = 0;
|
||||
const BATCH_SIZE: usize = 1000;
|
||||
const BATCH_NUM: usize = 100;
|
||||
|
||||
for line in reader.lines() {
|
||||
chunk.push(line.unwrap());
|
||||
|
||||
if chunk.len() >= BATCH_SIZE {
|
||||
let db = db.clone();
|
||||
let lines = chunk.clone();
|
||||
let pb = pb.clone();
|
||||
|
||||
futures.push(tokio::spawn(async move {
|
||||
create_db_entities(&db, lines, pb).await.unwrap();
|
||||
}));
|
||||
chunk_counter += 1;
|
||||
chunk.clear();
|
||||
}
|
||||
|
||||
if chunk_counter >= BATCH_NUM {
|
||||
join_all(futures).await;
|
||||
futures = Vec::new();
|
||||
chunk_counter = 0;
|
||||
}
|
||||
}
|
||||
|
||||
join_all(futures).await;
|
||||
create_db_entities_threaded(&db, reader, Some(pb.clone()), 1000, 100).await?;
|
||||
}
|
||||
|
||||
pb.finish();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue