diff --git a/.gitignore b/.gitignore index b089282..a931f3d 100644 --- a/.gitignore +++ b/.gitignore @@ -24,7 +24,7 @@ wheels/ .installed.cfg *.egg MANIFEST -.vscode +# .vscode # PyInstaller # Usually these files are written by a python script from a template diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..5e7d094 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,43 @@ +{ + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug executable 'ark-invest-api-rust-data'", + "cargo": { + "args": [ + "build", + "--bin=ark-invest-api-rust-data", + "--package=ark-invest-api-rust-data" + ], + "filter": { + "name": "ark-invest-api-rust-data", + "kind": "bin" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + }, + { + "type": "lldb", + "request": "launch", + "name": "Debug unit tests in executable 'ark-invest-api-rust-data'", + "cargo": { + "args": [ + "test", + "--no-run", + "--bin=ark-invest-api-rust-data", + "--package=ark-invest-api-rust-data" + ], + "filter": { + "name": "ark-invest-api-rust-data", + "kind": "bin" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..dfc4962 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,17 @@ +{ + "[rust]": { + "editor.defaultFormatter": "rust-lang.rust-analyzer", + "editor.formatOnSave": true, + "editor.formatOnSaveMode": "file" + }, + "rust-analyzer.check.command": "clippy", + "rust-analyzer.cargo.buildScripts.overrideCommand": [ + "cargo", + "clippy", + "--fix", + "--workspace", + "--message-format=json", + "--all-targets", + "--allow-dirty" + ] +} \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 3944ec3..82c4f68 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = 
"2021" [dependencies] -polars = { version = "0.28", features = [ +polars = { version = "0.30", features = [ "lazy", "strings", "parquet", diff --git a/README.md b/README.md index 22c3385..614eed4 100644 --- a/README.md +++ b/README.md @@ -1 +1,18 @@ Fetches and caches data from csv download and saves the data in parquet format + +# Dev Install +## Dev Containers +Install Docker, VS Code, the [Remote Development Extension Pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack), and the [GitHub Repositories Extension](https://marketplace.visualstudio.com/items?itemName=GitHub.remotehub). + +Press `Ctrl+Shift+P` and run **Dev Containers: Clone Repository in Container Volume**. + +Select GitHub, then paste the URL `https://github.com/NexVeridian/ark-invest-api-rust-data` + +Run code with `F5` or `cargo run` + +## Docker Compose +`git clone https://github.com/NexVeridian/ark-invest-api-rust-data` + +`docker compose build && docker compose up` + +Remove the Cargo cache for BuildKit with `docker builder prune --filter type=exec.cachemount` diff --git a/src/main.rs b/src/main.rs index 8fff30c..6aa2c06 100644 --- a/src/main.rs +++ b/src/main.rs @@ -40,10 +40,14 @@ fn main() { let dfn = df_format(read_parquet(Ticker::ARKVC).unwrap()).unwrap(); println!("{:#?}", dfn); - // update_parquet(Ticker::ARKVC).unwrap(); - // let update = df_format(Ticker::ARKF, get_csv(Ticker::ARKF).unwrap()).unwrap(); - // let update = get_csv(Ticker::ARKF).unwrap().collect().unwrap(); + // let update = df_format(get_csv(Ticker::ARKF).unwrap()).unwrap(); + // println!("{:#?}", update); + // update_parquet(Ticker::ARKVC).unwrap(); + // let x = df_format(read_parquet(Ticker::ARKVC).unwrap()).unwrap(); + // println!("{:#?}", x); + + // merge_csv_to_parquet(Ticker::ARKVC).unwrap(); // let x = df_format(read_parquet(Ticker::ARKVC).unwrap()).unwrap(); // println!("{:#?}", x); } diff --git a/src/util.rs b/src/util.rs index b6476c7..1ae543d 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,8 +1,7 @@ use glob::glob; use 
polars::datatypes::DataType; -use polars::lazy::dsl::StrpTimeOptions; use polars::prelude::*; -use polars::prelude::{DataFrame, UniqueKeepStrategy}; +use polars::prelude::{DataFrame, StrptimeOptions, UniqueKeepStrategy}; use reqwest::blocking::Client; use std::error::Error; use std::fs::File; @@ -35,16 +34,28 @@ impl Ticker { } } -pub fn merge_csv_to_parquet(folder: Ticker) -> Result<(), Box> { +pub fn merge_csv_to_parquet(ticker: Ticker) -> Result<(), Box> { let mut dfs = vec![]; - for x in glob(&format!("data/csv/{}/*", folder))?.filter_map(Result::ok) { + for x in glob(&format!("data/csv/{}/*", ticker))?.filter_map(Result::ok) { dfs.push(LazyCsvReader::new(x).finish()?); } - let df = concat(dfs, false, true)?; + let mut df = concat(dfs, false, true)?; + + if read_parquet(ticker).is_ok() { + let df_old = read_parquet(ticker)?; + df = concat( + vec![df_format(df_old)?.lazy(), df_format(df)?.lazy()], + false, + true, + )? + .unique_stable(None, UniqueKeepStrategy::First); + write_parquet(ticker, df_sort(df.collect()?)?)?; + } else { + write_parquet(ticker, df_format(df)?)?; + } - write_parquet(folder, df_format(df)?)?; Ok(()) } @@ -60,7 +71,7 @@ pub fn update_parquet(ticker: Ticker) -> Result<(), Box> { )? .unique_stable(None, UniqueKeepStrategy::First); - write_parquet(ticker, df.collect()?)?; + write_parquet(ticker, df_sort(df.collect()?)?)?; Ok(()) } @@ -79,6 +90,10 @@ pub fn write_parquet(ticker: Ticker, mut df: DataFrame) -> Result<(), Box Result> { + Ok(df.sort(["date", "weight"], vec![false, true])?) 
+} + pub fn df_format(df: LazyFrame) -> Result> { let mut df = df.collect()?; @@ -86,8 +101,8 @@ pub fn df_format(df: LazyFrame) -> Result> { df = df .lazy() .rename( - vec!["market_value_($), weight_(%)"], - vec!["market_value, weight"], + vec!["market_value_($)", "weight_(%)"], + vec!["market_value", "weight"], ) .collect()?; } @@ -95,8 +110,8 @@ pub fn df_format(df: LazyFrame) -> Result> { df = df .lazy() .rename( - vec!["market value ($), weight (%)"], - vec!["market_value, weight"], + vec!["market value ($)", "weight (%)"], + vec!["market_value", "weight"], ) .collect()?; } @@ -123,15 +138,15 @@ pub fn df_format(df: LazyFrame) -> Result> { let mut expressions: Vec = vec![]; if !df.fields().contains(&Field::new("date", DataType::Date)) { - expressions.push(col("date").str().strptime(StrpTimeOptions { - date_dtype: DataType::Date, - fmt: Some("%m/%d/%Y".into()), - strict: false, - exact: true, - cache: false, - tz_aware: false, - utc: false, - })); + expressions.push(col("date").str().strptime( + DataType::Date, + StrptimeOptions { + format: Some("%m/%d/%Y".into()), + strict: false, + exact: true, + cache: false, + }, + )); } if df.fields().contains(&Field::new("weight", DataType::Utf8)) { @@ -208,16 +223,8 @@ pub fn df_format(df: LazyFrame) -> Result> { pub fn get_csv(ticker: Ticker) -> Result> { let url = match ticker { - Ticker::ARKVC => { - "https://ark-ventures.com/wp-content/uploads/funds-etf-csv/ARK_VENTURE_FUND_HOLDINGS.csv".to_owned() - } - _ => { - format!( - "https://ark-funds.com/wp-content/uploads/funds-etf-csv/ARK_{}_ETF_{}_HOLDINGS.csv", - ticker.value(), - ticker - ) - } + Ticker::ARKVC => "https://ark-ventures.com/wp-content/uploads/funds-etf-csv/ARK_VENTURE_FUND_HOLDINGS.csv".to_owned(), + _ => format!("https://ark-funds.com/wp-content/uploads/funds-etf-csv/ARK_{}_ETF_{}_HOLDINGS.csv", ticker.value(), ticker), }; let response = Client::builder() @@ -232,7 +239,7 @@ pub fn get_csv(ticker: Ticker) -> Result> { .into()); } - let data: Vec = 
response.text()?.bytes().collect(); + let data = response.text()?.into_bytes(); let df = CsvReader::new(Cursor::new(data)) .has_header(true)