Mirror of https://github.com/NexVeridian/wikidata-to-surrealdb.git (synced 2025-09-02 09:59:13 +00:00)
refactor: CreateVersion and run_threaded
This commit is contained in: parent 17a115f473, commit b885315cd7
5 changed files with 210 additions and 235 deletions
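This refactor replaces the free functions create_db_entities, create_db_entities_bulk, create_db_entities_bulk_filter, and create_db_entities_threaded with methods on the CreateVersion enum (run, run_threaded, and a private spawn_chunk helper), and renames create_db_entity to create_entity. Callers now pick a variant first and call run_threaded on it. A minimal sketch of the call-site change, assembled from the argument values shown in the diffs below:

// before: free function, variant passed as the last argument
create_db_entities_threaded(Some(db.clone()), reader, None, 1_000, 100, CreateVersion::Single).await?;

// after: method on the chosen variant
CreateVersion::Single
    .run_threaded(Some(db.clone()), reader, None, 1_000, 100)
    .await?;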
@@ -28,14 +28,8 @@ fn bench(c: &mut Criterion) {
             .reader("tests/data/bench.json")
             .unwrap();

-        create_db_entities_threaded(
-            Some(db.clone()),
-            reader,
-            None,
-            1000,
-            100,
-            CreateVersion::Single,
-        )
+        CreateVersion::Single
+            .run_threaded(Some(db.clone()), reader, None, 1000, 100)
         .await
         .unwrap();
     })

@@ -51,14 +45,8 @@ fn bench(c: &mut Criterion) {
             .reader("tests/data/bench.json")
             .unwrap();

-        create_db_entities_threaded(
-            Some(db.clone()),
-            reader,
-            None,
-            1000,
-            100,
-            CreateVersion::Bulk,
-        )
+        CreateVersion::Bulk
+            .run_threaded(Some(db.clone()), reader, None, 1000, 100)
         .await
         .unwrap();
     })
src/main.rs (14 changed lines)

@@ -49,7 +49,7 @@ async fn main() -> Result<(), Error> {
                 let line = line?;

                 loop {
-                    if create_db_entity(&db, &line).await.is_ok() {
+                    if create_entity(&db, &line).await.is_ok() {
                         break;
                     }
                     if retries >= 60 * 10 {

@@ -69,35 +69,35 @@ async fn main() -> Result<(), Error> {
             }
         }
         CreateMode::ThreadedSingle => {
-            create_db_entities_threaded(
+            CreateVersion::Single
+                .run_threaded(
                 None::<Surreal<Client>>,
                 reader,
                 Some(pb.clone()),
                 2_500,
                 100,
-                CreateVersion::Single,
-            )
+                )
             .await?;
         }
         CreateMode::ThreadedBulk => {
-            create_db_entities_threaded(
+            CreateVersion::Bulk
+                .run_threaded(
                 None::<Surreal<Client>>,
                 reader,
                 Some(pb.clone()),
                 500,
                 1_000,
-                CreateVersion::Bulk,
-            )
+                )
             .await?;
         }
         CreateMode::ThreadedBulkFilter => {
-            create_db_entities_threaded(
+            CreateVersion::BulkFilter
+                .run_threaded(
                 None::<Surreal<Client>>,
                 reader,
                 Some(pb.clone()),
                 500,
                 1_000,
-                CreateVersion::BulkFilter,
-            )
+                )
             .await?;
         }
src/utils.rs (216 changed lines)

@@ -51,7 +51,7 @@ impl File_Format {
     }
 }

-pub async fn create_db_entity(db: &Surreal<impl Connection>, line: &str) -> Result<(), Error> {
+pub async fn create_entity(db: &Surreal<impl Connection>, line: &str) -> Result<(), Error> {
     let line = line.trim().trim_end_matches(',').to_string();
     if line == "[" || line == "]" {
         return Ok(());

@@ -76,14 +76,108 @@ pub async fn create_db_entity(db: &Surreal<impl Connection>, line: &str) -> Result<(), Error> {
     Ok(())
 }

-pub async fn create_db_entities(
+#[derive(Clone, Copy)]
+pub enum CreateVersion {
+    Single,
+    Bulk,
+    // must create a filter.surql file in the root directory
+    BulkFilter,
+}
+impl CreateVersion {
+    pub async fn run(
+        self,
+        db: &Surreal<impl Connection>,
+        chunk: &[String],
+        pb: &Option<ProgressBar>,
+        batch_size: usize,
+    ) -> bool {
+        match self {
+            CreateVersion::Single => self.create_single(db, chunk, pb).await.is_ok(),
+            CreateVersion::Bulk => self.create_bulk(db, chunk, pb, batch_size).await.is_ok(),
+            CreateVersion::BulkFilter => self
+                .create_bulk_filter(db, chunk, pb, batch_size)
+                .await
+                .is_ok(),
+        }
+    }
+
+    pub async fn run_threaded(
+        self,
+        dbo: Option<Surreal<impl Connection>>,
+        reader: Box<dyn BufRead>, // None::<Surreal<Client>>
+        pb: Option<ProgressBar>,
+        batch_size: usize,
+        batch_num: usize,
+    ) -> Result<(), Error> {
+        let mut lines = reader.lines().peekable();
+        let mut futures = Vec::new();
+
+        while lines.peek().is_some() {
+            let chunk: Vec<String> = lines
+                .by_ref()
+                .take(batch_size)
+                .filter_map(Result::ok)
+                .collect();
+
+            futures.push(self.spawn_chunk(dbo.clone(), chunk, pb.clone(), batch_size));
+
+            if futures.len() >= batch_num {
+                join_all(futures).await;
+                futures = Vec::new();
+            }
+        }
+
+        join_all(futures).await;
+        Ok(())
+    }
+
+    fn spawn_chunk(
+        &self,
+        dbo: Option<Surreal<impl Connection>>,
+        chunk: Vec<String>,
+        pb: Option<ProgressBar>,
+        batch_size: usize,
+    ) -> tokio::task::JoinHandle<()> {
+        let create_version = *self;
+
+        tokio::spawn(async move {
+            let mut retries = 0;
+            loop {
+                match dbo {
+                    Some(ref db) => {
+                        if create_version.run(db, &chunk, &pb, batch_size).await {
+                            break;
+                        }
+                    }
+                    None => {
+                        let db = match init_db::create_db_ws().await {
+                            Ok(db) => db,
+                            Err(_) => continue,
+                        };
+                        if create_version.run(&db, &chunk, &pb, batch_size).await {
+                            break;
+                        }
+                    }
+                }
+
+                if retries >= 60 * 10 {
+                    panic!("Failed to create entities, too many retries");
+                }
+                retries += 1;
+                sleep(Duration::from_millis(250)).await;
+            }
+        })
+    }
+
+    async fn create_single(
+        self,
         db: &Surreal<impl Connection>,
         lines: &[String],
         pb: &Option<ProgressBar>,
     ) -> Result<(), Error> {
         let mut counter = 0;
         for line in lines {
-            create_db_entity(db, line).await?;
+            create_entity(db, line).await?;
             counter += 1;
             if counter % 100 == 0 {
                 if let Some(ref p) = pb {

@@ -94,7 +188,8 @@ pub async fn create_db_entities(
         Ok(())
     }

-pub async fn create_db_entities_bulk(
+    async fn create_bulk(
+        self,
         db: &Surreal<impl Connection>,
         lines: &[String],
         pb: &Option<ProgressBar>,

@@ -143,14 +238,15 @@ pub async fn create_db_entities_bulk(
         Ok(())
     }

-pub async fn create_db_entities_bulk_filter(
+    async fn create_bulk_filter(
+        self,
         db: &Surreal<impl Connection>,
         lines: &[String],
         pb: &Option<ProgressBar>,
         batch_size: usize,
     ) -> Result<(), Error> {
         let db_mem = init_db::create_db_mem().await?;
-    create_db_entities_bulk(&db_mem, lines, &None, batch_size).await?;
+        self.create_bulk(&db_mem, lines, &None, batch_size).await?;

         let filter = tokio::fs::read_to_string(&*FILTER_PATH).await?;
         db_mem.query(filter).await?;

@@ -174,112 +270,4 @@ pub async fn create_db_entities_bulk_filter(
         }
         Ok(())
     }
-
-#[derive(Clone, Copy)]
-pub enum CreateVersion {
-    Single,
-    Bulk,
-    // must create a filter.surql file in the root directory
-    BulkFilter,
-}
-impl CreateVersion {
-    pub async fn run(
-        self,
-        db: &Surreal<impl Connection>,
-        chunk: &[String],
-        pb: &Option<ProgressBar>,
-        batch_size: usize,
-    ) -> bool {
-        match self {
-            CreateVersion::Single => create_db_entities(db, chunk, pb).await.is_ok(),
-            CreateVersion::Bulk => create_db_entities_bulk(db, chunk, pb, batch_size)
-                .await
-                .is_ok(),
-            CreateVersion::BulkFilter => create_db_entities_bulk_filter(db, chunk, pb, batch_size)
-                .await
-                .is_ok(),
-        }
-    }
-}
-
-pub async fn create_db_entities_threaded(
-    dbo: Option<Surreal<impl Connection>>, // None::<Surreal<Client>>
-    reader: Box<dyn BufRead>,
-    pb: Option<ProgressBar>,
-    batch_size: usize,
-    batch_num: usize,
-    create_version: CreateVersion,
-) -> Result<(), Error> {
-    let mut futures = Vec::new();
-    let mut chunk = Vec::with_capacity(batch_size);
-    let mut chunk_counter = 0;
-    let mut lines = reader.lines();
-    let mut last_loop = false;
-
-    loop {
-        let line = lines.next();
-        match line {
-            Some(line) => chunk.push(line?),
-            None => last_loop = true,
-        };
-
-        if chunk.len() >= batch_size || last_loop {
-            let dbo = dbo.clone();
-            let pb = pb.clone();
-
-            futures.push(tokio::spawn(async move {
-                let mut retries = 0;
-                loop {
-                    match dbo {
-                        Some(ref db) => {
-                            if create_version.run(db, &chunk, &pb, batch_size).await {
-                                break;
-                            }
-                            if db.use_ns("wikidata").use_db("wikidata").await.is_err() {
-                                continue;
-                            };
-                        }
-                        None => {
-                            let db = if let Ok(db) = init_db::create_db_ws().await {
-                                db
-                            } else {
-                                continue;
-                            };
-                            if create_version.run(&db, &chunk, &pb, batch_size).await {
-                                break;
-                            }
-                        }
-                    }
-
-                    if retries >= 60 * 10 {
-                        panic!("Failed to create entities, too many retries");
-                    }
-                    retries += 1;
-                    sleep(Duration::from_millis(250)).await;
-                }
-            }));
-            chunk_counter += 1;
-            chunk = Vec::with_capacity(batch_size);
-        }
-
-        if chunk_counter >= batch_num || last_loop {
-            join_all(futures).await;
-            futures = Vec::new();
-            chunk_counter = 0;
-        }
-        if last_loop {
-            break;
-        }
-    }
-
-    match dbo {
-        Some(db) => {
-            create_db_entities(&db, &chunk, &pb).await?;
-        }
-        None => {
-            create_db_entities(&init_db::create_db_ws().await?, &chunk, &pb).await?;
-        }
-    }
-    join_all(futures).await;
-    Ok(())
 }
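The new run_threaded drives everything through a chunk/spawn/join loop: it takes batch_size lines at a time, spawns one tokio task per chunk via spawn_chunk, and awaits the accumulated tasks once batch_num of them are queued; each task retries run every 250 ms and panics after 600 failed attempts (60 * 10, roughly 2.5 minutes). A minimal sketch of that pattern in isolation, with the SurrealDB work replaced by a placeholder (process_chunked and the spawned task body are illustrative only, not the repository's code):

use futures::future::join_all;
use std::io::BufRead;

// Illustrative reduction of the chunk/spawn/join pattern used by run_threaded.
async fn process_chunked(reader: Box<dyn BufRead>, batch_size: usize, batch_num: usize) {
    let mut lines = reader.lines().peekable();
    let mut futures = Vec::new();

    while lines.peek().is_some() {
        // Pull up to batch_size lines, dropping read errors.
        let chunk: Vec<String> = lines
            .by_ref()
            .take(batch_size)
            .filter_map(Result::ok)
            .collect();

        // In the real code this is self.spawn_chunk(...), which wraps
        // create_version.run(...) in the retry loop described above.
        futures.push(tokio::spawn(async move {
            println!("processing {} lines", chunk.len());
        }));

        // Cap the number of in-flight tasks.
        if futures.len() >= batch_num {
            join_all(futures).await;
            futures = Vec::new();
        }
    }

    join_all(futures).await;
}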
@@ -34,5 +34,6 @@ pub async fn create_db_ws() -> Result<Surreal<Client>, Error> {
 pub async fn create_db_mem() -> Result<Surreal<Db>, Error> {
     let db = Surreal::new::<Mem>(()).await?;
     db.use_ns("wikidata").use_db("wikidata").await?;
+
     Ok(db)
 }
@@ -42,7 +42,7 @@ async fn entity() {
     let reader = init_reader("json", "Entity");

     for line in reader.lines() {
-        create_db_entity(&db, &line.unwrap()).await.unwrap();
+        create_entity(&db, &line.unwrap()).await.unwrap();
     }

     assert_eq!(51.0, entity_query(&db).await.unwrap().unwrap())

@@ -56,7 +56,9 @@ async fn entity_threaded(#[case] version: CreateVersion) -> Result<(), Error> {
     let db = inti_db().await?;
     let reader = init_reader("json", "Entity");

-    create_db_entities_threaded(Some(db.clone()), reader, None, 1_000, 100, version).await?;
+    version
+        .run_threaded(Some(db.clone()), reader, None, 1_000, 100)
+        .await?;

     assert_eq!(51.0, entity_query(&db).await?.unwrap());
     Ok(())

@@ -68,14 +70,8 @@ async fn entity_threaded_filter() -> Result<(), Error> {
     let db = inti_db().await?;
     let reader = init_reader("json", "bench");

-    create_db_entities_threaded(
-        Some(db.clone()),
-        reader,
-        None,
-        1_000,
-        100,
-        CreateVersion::BulkFilter,
-    )
+    CreateVersion::BulkFilter
+        .run_threaded(Some(db.clone()), reader, None, 1_000, 100)
     .await?;

     let count: Option<f32> = db

@@ -105,7 +101,7 @@ async fn property() {
     let reader = init_reader("json", "Property");

     for line in reader.lines() {
-        create_db_entity(&db, &line.unwrap()).await.unwrap();
+        create_entity(&db, &line.unwrap()).await.unwrap();
     }

     assert_eq!(2.0, property_query(&db).await.unwrap().unwrap())

@@ -119,7 +115,9 @@ async fn property_threaded(#[case] version: CreateVersion) -> Result<(), Error> {
     let db = inti_db().await?;
     let reader = init_reader("json", "Property");

-    create_db_entities_threaded(Some(db.clone()), reader, None, 1_000, 100, version).await?;
+    version
+        .run_threaded(Some(db.clone()), reader, None, 1_000, 100)
+        .await?;

     assert_eq!(2.0, property_query(&db).await?.unwrap());
     Ok(())
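The threaded tests are parameterized over CreateVersion (the #[case] version: CreateVersion argument in the hunk headers) and now call the method form directly. A hypothetical shape of one such test, assuming rstest-style #[case] attributes and a tokio test runtime (the diff only shows the body), relying on the test module's existing helpers (inti_db, init_reader, entity_query):

// Hypothetical test shape; the attribute macros and case values are assumed,
// the body matches the entity_threaded hunk above.
#[rstest]
#[case(CreateVersion::Single)]
#[case(CreateVersion::Bulk)]
#[tokio::test]
async fn entity_threaded(#[case] version: CreateVersion) -> Result<(), Error> {
    let db = inti_db().await?;
    let reader = init_reader("json", "Entity");

    version
        .run_threaded(Some(db.clone()), reader, None, 1_000, 100)
        .await?;

    assert_eq!(51.0, entity_query(&db).await?.unwrap());
    Ok(())
}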