mirror of
https://github.com/NexVeridian/wikidata-to-surrealdb.git
synced 2025-09-02 09:59:13 +00:00
refactor: main
This commit is contained in:
parent
b885315cd7
commit
bb9967ced6
3 changed files with 25 additions and 88 deletions
12
README.md
12
README.md
|
@ -53,17 +53,17 @@ WIKIDATA_FILE_NAME=data/latest-all.json.bz2
|
||||||
WIKIDATA_DB_PORT=surrealdb:8000
|
WIKIDATA_DB_PORT=surrealdb:8000
|
||||||
# true=overwrite existing data, false=skip if already exists
|
# true=overwrite existing data, false=skip if already exists
|
||||||
OVERWRITE_DB=false
|
OVERWRITE_DB=false
|
||||||
CREATE_MODE=ThreadedSingle
|
CREATE_VERSION=Bulk
|
||||||
#FILTER_PATH=../filter.surql
|
#FILTER_PATH=../filter.surql
|
||||||
```
|
```
|
||||||
|
|
||||||
Env string CREATE_MODE must be in the enum CreateMode
|
Env string CREATE_VERSION must be in the enum CreateVersion
|
||||||
```rust
|
```rust
|
||||||
pub enum CreateMode {
|
pub enum CreateVersion {
|
||||||
Single,
|
Single,
|
||||||
ThreadedSingle,
|
#[default]
|
||||||
ThreadedBulk,
|
Bulk,
|
||||||
// must create a filter.surql file in the root directory
|
/// must create a filter.surql file in the root directory
|
||||||
BulkFilter,
|
BulkFilter,
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
96
src/main.rs
96
src/main.rs
|
@ -1,6 +1,6 @@
|
||||||
use anyhow::{Error, Ok, Result};
|
use anyhow::{Error, Ok, Result};
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use std::{env, io::BufRead};
|
use std::env;
|
||||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
use surrealdb::{engine::remote::ws::Client, Surreal};
|
||||||
use tokio::time::{sleep, Duration};
|
use tokio::time::{sleep, Duration};
|
||||||
|
|
||||||
|
@ -12,96 +12,32 @@ lazy_static! {
|
||||||
env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set");
|
env::var("WIKIDATA_FILE_FORMAT").expect("FILE_FORMAT not set");
|
||||||
static ref WIKIDATA_FILE_NAME: String =
|
static ref WIKIDATA_FILE_NAME: String =
|
||||||
env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set");
|
env::var("WIKIDATA_FILE_NAME").expect("FILE_NAME not set");
|
||||||
static ref CREATE_MODE: CreateMode = match env::var("CREATE_MODE")
|
static ref CREATE_VERSION: CreateVersion = match env::var("CREATE_VERSION")
|
||||||
.expect("CREATE_MODE not set")
|
.expect("CREATE_VERSION not set")
|
||||||
.as_str()
|
.as_str()
|
||||||
{
|
{
|
||||||
"Single" => CreateMode::Single,
|
"Single" => CreateVersion::Single,
|
||||||
"ThreadedSingle" => CreateMode::ThreadedSingle,
|
"Bulk" => CreateVersion::Bulk,
|
||||||
"ThreadedBulk" => CreateMode::ThreadedBulk,
|
"BulkFilter" => CreateVersion::BulkFilter,
|
||||||
"ThreadedBulkFilter" => CreateMode::ThreadedBulkFilter,
|
_ => panic!("Unknown CREATE_VERSION"),
|
||||||
_ => panic!("Unknown CREATE_MODE"),
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
|
||||||
pub enum CreateMode {
|
|
||||||
Single,
|
|
||||||
ThreadedSingle,
|
|
||||||
ThreadedBulk,
|
|
||||||
// must create a filter.surql file in the root directory
|
|
||||||
ThreadedBulkFilter,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<(), Error> {
|
async fn main() -> Result<(), Error> {
|
||||||
sleep(Duration::from_secs(10)).await;
|
sleep(Duration::from_secs(10)).await;
|
||||||
let pb = init_progress_bar::create_pb().await;
|
let pb = init_progress_bar::create_pb().await;
|
||||||
|
|
||||||
let db = init_db::create_db_ws().await?;
|
|
||||||
let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;
|
let reader = File_Format::new(&WIKIDATA_FILE_FORMAT).reader(&WIKIDATA_FILE_NAME)?;
|
||||||
|
|
||||||
match *CREATE_MODE {
|
CREATE_VERSION
|
||||||
CreateMode::Single => {
|
.run_threaded(
|
||||||
let mut counter = 0;
|
None::<Surreal<Client>>,
|
||||||
for line in reader.lines() {
|
reader,
|
||||||
let mut retries = 0;
|
Some(pb.clone()),
|
||||||
let line = line?;
|
500,
|
||||||
|
1_000,
|
||||||
loop {
|
)
|
||||||
if create_entity(&db, &line).await.is_ok() {
|
.await?;
|
||||||
break;
|
|
||||||
}
|
|
||||||
if retries >= 60 * 10 {
|
|
||||||
panic!("Failed to create entities, too many retries");
|
|
||||||
}
|
|
||||||
retries += 1;
|
|
||||||
sleep(Duration::from_secs(1)).await;
|
|
||||||
if db.use_ns("wikidata").use_db("wikidata").await.is_err() {
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
counter += 1;
|
|
||||||
if counter % 100 == 0 {
|
|
||||||
pb.inc(100);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
CreateMode::ThreadedSingle => {
|
|
||||||
CreateVersion::Single
|
|
||||||
.run_threaded(
|
|
||||||
None::<Surreal<Client>>,
|
|
||||||
reader,
|
|
||||||
Some(pb.clone()),
|
|
||||||
2_500,
|
|
||||||
100,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
CreateMode::ThreadedBulk => {
|
|
||||||
CreateVersion::Bulk
|
|
||||||
.run_threaded(
|
|
||||||
None::<Surreal<Client>>,
|
|
||||||
reader,
|
|
||||||
Some(pb.clone()),
|
|
||||||
500,
|
|
||||||
1_000,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
CreateMode::ThreadedBulkFilter => {
|
|
||||||
CreateVersion::BulkFilter
|
|
||||||
.run_threaded(
|
|
||||||
None::<Surreal<Client>>,
|
|
||||||
reader,
|
|
||||||
Some(pb.clone()),
|
|
||||||
500,
|
|
||||||
1_000,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pb.finish();
|
pb.finish();
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
|
@ -76,11 +76,12 @@ pub async fn create_entity(db: &Surreal<impl Connection>, line: &str) -> Result<
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy, Default)]
|
||||||
pub enum CreateVersion {
|
pub enum CreateVersion {
|
||||||
Single,
|
Single,
|
||||||
|
#[default]
|
||||||
Bulk,
|
Bulk,
|
||||||
// must create a filter.surql file in the root directory
|
/// must create a filter.surql file in the root directory
|
||||||
BulkFilter,
|
BulkFilter,
|
||||||
}
|
}
|
||||||
impl CreateVersion {
|
impl CreateVersion {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue