This commit is contained in:
Elijah McMorris 2023-08-10 10:10:45 +00:00 committed by NexVeridian@gmail.com
parent 1eda4729d4
commit fdc0515939
5 changed files with 183 additions and 82 deletions

View file

@ -1,10 +1,11 @@
[package] [package]
name = "ark-invest-api-rust-data" name = "ark-invest-api-rust-data"
version = "0.1.0" license = "Apache-2.0"
version = "1.0.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
polars = { version = "0.30", features = [ polars = { version = "0.32", features = [
"lazy", "lazy",
"strings", "strings",
"parquet", "parquet",
@ -16,7 +17,7 @@ polars = { version = "0.30", features = [
] } ] }
reqwest = { version = "0.11", features = ["blocking", "gzip"] } reqwest = { version = "0.11", features = ["blocking", "gzip"] }
glob = { version = "0.3" } glob = { version = "0.3" }
clokwerk = "0.4.0" clokwerk = "0.4"
strum_macros = "0.25" strum_macros = "0.25"
strum = "0.25" strum = "0.25"
tokio = { version = "1.26", features = ["full"] } tokio = { version = "1.26", features = ["full"] }
@ -25,7 +26,7 @@ chrono = { version = "0.4", features = ["serde"] }
serde_json = "1.0" serde_json = "1.0"
rand = "0.8" rand = "0.8"
futures = "0.3" futures = "0.3"
lazy_static = "1.4.0" lazy_static = "1.4"
[dev-dependencies] [dev-dependencies]
serial_test = "*" serial_test = "*"

View file

@ -29,10 +29,15 @@ pub enum Source {
Read, Read,
// From ARK Invest // From ARK Invest
Ark, Ark,
// From api.NexVeridian.com (Default) // From api.NexVeridian.com
#[default]
ApiIncremental, ApiIncremental,
// From api.NexVeridian.com, not usually necessary, use ApiIncremental // From api.NexVeridian.com, not usually necessary, use ApiIncremental
ApiFull, ApiFull,
// From arkfunds.io/api, avoid using, use ApiIncremental instead
ArkFundsIoIncremental,
// From arkfunds.io/api, avoid using, use ApiFull instead
ArkFundsIoFull,
} }
``` ```

View file

@ -19,7 +19,7 @@ lazy_static! {
static ref SOURCE: Source = match env::var("ARK_SOURCE") { static ref SOURCE: Source = match env::var("ARK_SOURCE") {
Ok(val) => Ok(val) =>
Source::from_str(val.as_str()).expect("Env string ARK_SOURCE is not in enum Source"), Source::from_str(val.as_str()).expect("Env string ARK_SOURCE is not in enum Source"),
Err(_e) => Source::ApiIncremental, Err(_) => Source::ApiIncremental,
}; };
} }

View file

@ -1,4 +1,4 @@
use chrono::NaiveDate; use chrono::{Duration, NaiveDate};
use glob::glob; use glob::glob;
use polars::datatypes::DataType; use polars::datatypes::DataType;
use polars::lazy::dsl::StrptimeOptions; use polars::lazy::dsl::StrptimeOptions;
@ -10,15 +10,17 @@ use serde_json::Value;
use std::error::Error; use std::error::Error;
use std::fs::{create_dir_all, File}; use std::fs::{create_dir_all, File};
use std::io::Cursor; use std::io::Cursor;
use std::path::Path; use std::path::Path;
use std::result::Result; use std::result::Result;
use strum_macros::{EnumIter, EnumString}; use strum_macros::{EnumIter, EnumString};
#[derive(strum_macros::Display, EnumIter, Clone, Copy, PartialEq, Debug)] #[derive(Debug, Default, strum_macros::Display, EnumIter, Clone, Copy, PartialEq)]
pub enum Ticker { pub enum Ticker {
ARKVC, ARKVC,
ARKF, ARKF,
ARKG, ARKG,
#[default]
ARKK, ARKK,
ARKQ, ARKQ,
ARKW, ARKW,
@ -80,17 +82,24 @@ impl DFS for Vec<DF> {
} }
} }
#[derive(EnumString, Clone, Copy)] #[derive(Debug, Default, EnumString, Clone, Copy, PartialEq)]
pub enum Source { pub enum Source {
// Reads Parquet file if exists // Reads Parquet file if exists
Read, Read,
// From ARK Invest // From ARK Invest
Ark, Ark,
// From api.NexVeridian.com // From api.NexVeridian.com
#[default]
ApiIncremental, ApiIncremental,
// From api.NexVeridian.com, not usually necessary, use ApiIncremental // From api.NexVeridian.com, not usually necessary, use ApiIncremental
ApiFull, ApiFull,
// From arkfunds.io/api, avoid using, use ApiIncremental instead
ArkFundsIoIncremental,
// From arkfunds.io/api, avoid using, use ApiFull instead
ArkFundsIoFull,
} }
#[derive(Clone)]
pub struct Ark { pub struct Ark {
pub df: DF, pub df: DF,
ticker: Ticker, ticker: Ticker,
@ -102,11 +111,11 @@ impl Ark {
ticker: Ticker, ticker: Ticker,
path: Option<String>, path: Option<String>,
) -> Result<Self, Box<dyn Error>> { ) -> Result<Self, Box<dyn Error>> {
let existing_file = Self::read_parquet(ticker, path.clone()).is_ok(); let existing_file = Self::read_parquet(&ticker, path.as_ref()).is_ok();
let mut ark = Self { let mut ark = Self {
df: match existing_file { df: match existing_file {
true => Self::read_parquet(ticker, path.clone())?, true => Self::read_parquet(&ticker, path.as_ref())?,
false => DF::DataFrame(df!["date" => [""],]?), false => DF::DataFrame(df!["date" => [""],]?),
}, },
ticker, ticker,
@ -115,22 +124,16 @@ impl Ark {
let update = match (source, existing_file) { let update = match (source, existing_file) {
(Source::Read, false) => { (Source::Read, false) => {
panic!("Can not read from file, file is empty, does not exist, or is locked") panic!("Can not read from file. file is empty, does not exist, or is locked")
} }
(Source::Read, true) => None, (Source::Read, true) => None,
(Source::Ark, _) => Some(ark.get_csv_ark()?), (Source::Ark, _) => Some(ark.get_csv_ark()?),
(Source::ApiIncremental, true) => { (Source::ApiIncremental, true) | (Source::ArkFundsIoIncremental, true) => {
let last_day = ark let last_day = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()
.df + Duration::days(ark.df.clone().collect()?.column("date")?.max().unwrap());
.clone() Some(ark.get_api(Some(last_day), Some(&source))?)
.collect()?
.column("date")
.unwrap()
.max()
.and_then(NaiveDate::from_num_days_from_ce_opt);
Some(ark.get_api(last_day)?)
} }
(Source::ApiIncremental, false) | (Source::ApiFull, _) => Some(ark.get_api(None)?), _ => Some(ark.get_api(None, Some(&source))?),
}; };
if let Some(update) = update { if let Some(update) = update {
@ -174,7 +177,7 @@ impl Ark {
Ok(()) Ok(())
} }
fn read_parquet(ticker: Ticker, path: Option<String>) -> Result<DF, Box<dyn Error>> { fn read_parquet(ticker: &Ticker, path: Option<&String>) -> Result<DF, Box<dyn Error>> {
let df = LazyFrame::scan_parquet( let df = LazyFrame::scan_parquet(
match path { match path {
Some(p) => format!("{}/{}.parquet", p, ticker), Some(p) => format!("{}/{}.parquet", p, ticker),
@ -186,20 +189,25 @@ impl Ark {
} }
pub fn sort(mut self) -> Result<Self, Box<dyn Error>> { pub fn sort(mut self) -> Result<Self, Box<dyn Error>> {
self.df = Self::df_sort(self.df.clone())?; self.df = Self::df_sort(self.df)?;
Ok(self) Ok(self)
} }
pub fn df_sort(df: DF) -> Result<DF, Box<dyn Error>> { pub fn df_sort(df: DF) -> Result<DF, Box<dyn Error>> {
Ok(df Ok(df
.collect()? .collect()?
.sort(["date", "weight"], vec![false, true])? .sort(["date", "weight"], vec![false, true], false)?
.into()) .into())
} }
fn concat_df(dfs: Vec<DF>) -> Result<DF, Box<dyn Error>> { fn concat_df(dfs: Vec<DF>) -> Result<DF, Box<dyn Error>> {
// with dedupe // with dedupe
let df = concat(dfs.lazy(), false, true)?; let df = concat(
dfs.lazy(),
UnionArgs {
..Default::default()
},
)?;
Self::dedupe(df.into()) Self::dedupe(df.into())
} }
@ -212,7 +220,7 @@ impl Ark {
} }
pub fn format(mut self) -> Result<Self, Box<dyn Error>> { pub fn format(mut self) -> Result<Self, Box<dyn Error>> {
self.df = Self::df_format(self.df.clone())?; self.df = Self::df_format(self.df)?;
Ok(self) Ok(self)
} }
@ -263,7 +271,8 @@ impl Ark {
} }
if !df.fields().contains(&Field::new("date", DataType::Date)) { if !df.fields().contains(&Field::new("date", DataType::Date)) {
let date_format = |mut df: DataFrame, format:Option<String>| -> Result<DataFrame, Box<dyn Error>> { let date_format =
|mut df: DataFrame, format: Option<String>| -> Result<DataFrame, Box<dyn Error>> {
df = df df = df
.lazy() .lazy()
.with_column(col("date").str().strptime( .with_column(col("date").str().strptime(
@ -271,8 +280,7 @@ impl Ark {
StrptimeOptions { StrptimeOptions {
format, format,
strict: false, strict: false,
exact: true, ..Default::default()
cache: true,
}, },
)) ))
.collect()?; .collect()?;
@ -286,11 +294,9 @@ impl Ark {
if let Ok(x) = date_format(df.clone(), Some("%m/%d/%Y".into())) { if let Ok(x) = date_format(df.clone(), Some("%m/%d/%Y".into())) {
df = x df = x
} } else if let Ok(x) = date_format(df.clone(), Some("%Y/%m/%d".into())) {
else if let Ok(x) = date_format(df.clone(), Some("%Y/%m/%d".into())) {
df = x df = x
} } else if let Ok(x) = date_format(df.clone(), None) {
else if let Ok(x) = date_format(df.clone(), None) {
df = x df = x
} }
} }
@ -321,6 +327,13 @@ impl Ark {
); );
} }
if df
.fields()
.contains(&Field::new("market_value", DataType::Float64))
{
expressions.push(col("market_value").cast(DataType::Int64));
}
if df.fields().contains(&Field::new("shares", DataType::Utf8)) { if df.fields().contains(&Field::new("shares", DataType::Utf8)) {
expressions.push( expressions.push(
col("shares") col("shares")
@ -330,6 +343,53 @@ impl Ark {
); );
} }
// rename values
expressions.push(
col("ticker")
.str()
.replace(lit("DKNG UW"), lit("DKNN"), true)
.str()
.replace(lit("NU UN"), lit("NU"), true)
.str()
.replace(lit("DSY"), lit("DSY FP"), true)
.str()
.replace(lit("GRMN UN"), lit("GRMN"), true)
.str()
.replace(lit("ARCT UQ"), lit("ARCT"), true)
.str()
.replace(lit("PRNT UF"), lit("PRNT"), true),
);
expressions.push(
col("company")
.str()
.replace_all(lit("-A"), lit(""), true)
.str()
.replace_all(lit("- A"), lit(""), true)
.str()
.replace_all(lit("-CL A"), lit(""), true)
.str()
.replace_all(lit("-CLASS A"), lit(""), true)
.str()
.replace_all(lit("Inc"), lit(""), true)
.str()
.replace_all(lit("INC"), lit(""), true)
.str()
.replace_all(lit("LTD"), lit(""), true)
.str()
.replace_all(lit("CORP"), lit(""), true)
.str()
.replace_all(lit("CORPORATION"), lit(""), true)
.str()
.replace_all(lit(","), lit(""), true)
.str()
.replace_all(lit("."), lit(""), true)
.str()
.replace(lit("Blackdaemon"), lit("Blockdaemon"), true)
.str()
.rstrip(None),
);
// run expressions
df = df df = df
.lazy() .lazy()
.with_columns(expressions) .with_columns(expressions)
@ -372,25 +432,55 @@ impl Ark {
Ok(df.into()) Ok(df.into())
} }
pub fn get_api(&self, last_day: Option<NaiveDate>) -> Result<LazyFrame, Box<dyn Error>> { pub fn get_api(
let url = match (self.ticker, last_day) { &self,
last_day: Option<NaiveDate>,
source: Option<&Source>,
) -> Result<DataFrame, Box<dyn Error>> {
let url = match (&self.ticker, last_day) {
(self::Ticker::ARKVC, Some(last_day)) => format!( (self::Ticker::ARKVC, Some(last_day)) => format!(
"https://api.nexveridian.com/arkvc_holdings?start={}", "https://api.nexveridian.com/arkvc_holdings?start={}",
last_day last_day
), ),
(tic, Some(last_day)) => format!( (tic, Some(last_day)) => match source {
Some(Source::ArkFundsIoIncremental) => format!(
"https://arkfunds.io/api/v2/etf/holdings?symbol={}&date_from={}",
tic, last_day
),
_ => format!(
"https://api.nexveridian.com/ark_holdings?ticker={}&start={}", "https://api.nexveridian.com/ark_holdings?ticker={}&start={}",
tic, last_day tic, last_day
), ),
},
(self::Ticker::ARKVC, None) => "https://api.nexveridian.com/arkvc_holdings".to_owned(), (self::Ticker::ARKVC, None) => "https://api.nexveridian.com/arkvc_holdings".to_owned(),
(tic, None) => { (tic, None) => match source {
Some(Source::ArkFundsIoFull) => {
format!("https://arkfunds.io/api/v2/etf/holdings?symbol={}", tic)
}
_ => {
format!("https://api.nexveridian.com/ark_holdings?ticker={}", tic) format!("https://api.nexveridian.com/ark_holdings?ticker={}", tic)
} }
},
}; };
Reader::Json.get_data_url(url)
let mut df = Reader::Json.get_data_url(url)?;
df = match source {
Some(Source::ArkFundsIoIncremental) | Some(Source::ArkFundsIoFull) => {
df = df
.column("holdings")?
.clone()
.explode()?
.struct_()?
.clone()
.unnest();
df
}
_ => df,
};
Ok(df)
} }
pub fn get_csv_ark(&self) -> Result<LazyFrame, Box<dyn Error>> { pub fn get_csv_ark(&self) -> Result<DataFrame, Box<dyn Error>> {
let url = match self.ticker { let url = match self.ticker {
self::Ticker::ARKVC => "https://ark-ventures.com/wp-content/uploads/funds-etf-csv/ARK_VENTURE_FUND_HOLDINGS.csv".to_owned(), self::Ticker::ARKVC => "https://ark-ventures.com/wp-content/uploads/funds-etf-csv/ARK_VENTURE_FUND_HOLDINGS.csv".to_owned(),
_ => format!("https://ark-funds.com/wp-content/uploads/funds-etf-csv/ARK_{}_ETF_{}_HOLDINGS.csv", self.ticker.value(), self.ticker), _ => format!("https://ark-funds.com/wp-content/uploads/funds-etf-csv/ARK_{}_ETF_{}_HOLDINGS.csv", self.ticker.value(), self.ticker),
@ -406,23 +496,30 @@ impl Ark {
for x in glob(&format!("data/csv/{}/*", ticker))?.filter_map(Result::ok) { for x in glob(&format!("data/csv/{}/*", ticker))?.filter_map(Result::ok) {
dfs.push(LazyCsvReader::new(x).finish()?); dfs.push(LazyCsvReader::new(x).finish()?);
} }
let mut df = concat(dfs, false, true)?.into(); let mut df = concat(
dfs,
UnionArgs {
..Default::default()
},
)?
.into();
if Self::read_parquet(ticker, path.clone()).is_ok() { if Self::read_parquet(&ticker, path.as_ref()).is_ok() {
let df_old = Self::read_parquet(ticker, path.clone())?; let df_old = Self::read_parquet(&ticker, path.as_ref())?;
df = Self::concat_df(vec![Self::df_format(df_old)?, Self::df_format(df)?])?; df = Self::concat_df(vec![Self::df_format(df_old)?, Self::df_format(df)?])?;
} }
Ok(Self { df, ticker, path }) Ok(Self { df, ticker, path })
} }
} }
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Reader { pub enum Reader {
Csv, Csv,
Json, Json,
} }
impl Reader { impl Reader {
pub fn get_data_url(&self, url: String) -> Result<LazyFrame, Box<dyn Error>> { pub fn get_data_url(&self, url: String) -> Result<DataFrame, Box<dyn Error>> {
let mut headers = HeaderMap::new(); let mut headers = HeaderMap::new();
headers.insert( headers.insert(
header::USER_AGENT, header::USER_AGENT,
@ -441,7 +538,9 @@ impl Reader {
let response = Client::builder() let response = Client::builder()
.default_headers(headers) .default_headers(headers)
.gzip(true) .gzip(true)
.build()?.get(url).send()?; .build()?
.get(url)
.send()?;
if !response.status().is_success() { if !response.status().is_success() {
return Err(format!( return Err(format!(
@ -453,17 +552,14 @@ impl Reader {
let data = response.text()?.into_bytes(); let data = response.text()?.into_bytes();
let df: LazyFrame = match self { let df = match self {
Self::Csv => CsvReader::new(Cursor::new(data)) Self::Csv => CsvReader::new(Cursor::new(data))
.has_header(true) .has_header(true)
.finish()? .finish()?,
.lazy(),
Self::Json => { Self::Json => {
let json_string = String::from_utf8(data)?; let json_string = String::from_utf8(data)?;
let json: Value = serde_json::from_str(&json_string)?; let json: Value = serde_json::from_str(&json_string)?;
JsonReader::new(Cursor::new(json.to_string())) JsonReader::new(Cursor::new(json.to_string())).finish()?
.finish()?
.lazy()
} }
}; };

View file

@ -12,8 +12,7 @@ fn get_api_arkk() -> Result<(), Box<dyn Error>> {
Ticker::ARKK, Ticker::ARKK,
Some("data/test".to_owned()), Some("data/test".to_owned()),
)? )?
.get_api(NaiveDate::from_ymd_opt(2023, 5, 18))? .get_api(NaiveDate::from_ymd_opt(2023, 5, 18), None)?;
.collect()?;
let expected = [ let expected = [
"company", "company",
@ -43,7 +42,7 @@ fn get_api_format_arkk() -> Result<(), Box<dyn Error>> {
Ticker::ARKK, Ticker::ARKK,
Some("data/test".to_owned()), Some("data/test".to_owned()),
)? )?
.get_api(NaiveDate::from_ymd_opt(2023, 5, 18))?; .get_api(NaiveDate::from_ymd_opt(2023, 5, 18), None)?;
let df = Ark::df_format(dfl.into())?.collect()?; let df = Ark::df_format(dfl.into())?.collect()?;
assert_eq!( assert_eq!(
@ -83,7 +82,7 @@ fn get_api_format_arkvc() -> Result<(), Box<dyn Error>> {
Ticker::ARKVC, Ticker::ARKVC,
Some("data/test".to_owned()), Some("data/test".to_owned()),
)? )?
.get_api(NaiveDate::from_ymd_opt(2023, 1, 1))?; .get_api(NaiveDate::from_ymd_opt(2023, 1, 1), None)?;
let df = Ark::df_format(dfl.into())?.collect()?; let df = Ark::df_format(dfl.into())?.collect()?;
assert_eq!( assert_eq!(