diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 3c4eea0..7aa0a93 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -57,7 +57,7 @@ "extensions": [ "vadimcn.vscode-lldb", "serayuzgur.crates", - "bungcip.better-toml", + "tamasfe.even-better-toml", "rust-lang.rust-analyzer", "mutantdino.resourcemonitor", "christian-kohler.path-intellisense", diff --git a/.dockerignore b/.dockerignore index e85ef84..729d6d6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,8 @@ +/.devcontainer +/.vscode /target +.dockerignore Cargo.lock +Dockerfile +docker-compose.yml +*.md diff --git a/.gitignore b/.gitignore index a931f3d..270d0b9 100644 --- a/.gitignore +++ b/.gitignore @@ -24,7 +24,6 @@ wheels/ .installed.cfg *.egg MANIFEST -# .vscode # PyInstaller # Usually these files are written by a python script from a template diff --git a/Cargo.toml b/Cargo.toml index 2d1a2d0..4c295c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ polars = { version = "0.30", features = [ "object", "dtype-struct", ] } -reqwest = { version = "0.11", features = ["blocking"] } +reqwest = { version = "0.11", features = ["blocking", "gzip"] } glob = { version = "0.3" } clokwerk = "0.4.0" strum_macros = "0.25" @@ -24,6 +24,7 @@ openssl = { version = "0.10", features = ["vendored"] } chrono = { version = "0.4", features = ["serde"] } serde_json = "1.0" rand = "0.8" +futures = "0.3" [dev-dependencies] serial_test = "*" diff --git a/Dockerfile b/Dockerfile index 3e55d26..d463d8e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM rust:bookworm AS builder RUN apt-get update && \ - apt install -y musl-tools musl-dev libssl-dev clang mold + apt install -y musl-tools musl-dev libssl-dev clang mold # RUN curl -LsSf https://get.nexte.st/latest/linux | tar zxf - -C ${CARGO_HOME:-~/.cargo}/bin RUN curl -LsSf https://get.nexte.st/latest/linux | tar zxf - -C /usr/local/bin @@ -25,13 +25,10 @@ RUN 
--mount=type=cache,target=/usr/local/cargo,from=rust,source=/usr/local/cargo cargo nextest run --release --target x86_64-unknown-linux-musl \ -E "all() - test(get_api) - kind(bin)" -FROM alpine:latest +FROM alpine:latest AS main WORKDIR /ark-invest-api-rust-data COPY --from=builder ark-invest-api-rust-data/ark-invest-api-rust-data . -ENV PORT=3000 -EXPOSE 3000 - CMD ["./ark-invest-api-rust-data"] diff --git a/NOTES.md b/NOTES.md index d62b975..2affe9a 100644 --- a/NOTES.md +++ b/NOTES.md @@ -1,4 +1,13 @@ https://ark-funds.com/ark-trade-notifications/ -https://etfs.ark-funds.com/hubfs/idt/trades/ARK_Trade_06072023_0800PM_EST_6480efd1294b5.xls cargo clean && cargo build --timings + +# Futures +https://stackoverflow.com/questions/68448854/how-to-await-for-the-first-k-futures + +# 403 +https://docs.rs/http/latest/http/header/index.html +https://docs.rs/http/latest/http/index.html?search=HOST +https://stackoverflow.com/questions/70931027/http-403-forbidden-is-showing-while-scraping-a-data-from-a-website-using-python +https://stackoverflow.com/questions/48756326/web-scraping-results-in-403-forbidden-error + diff --git a/README.md b/README.md index be42f07..1186fc9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,20 @@ -Fetches and caches data from csv download and saves the data in parquet format +Fetches and caches ETF data daily, from csv download or api, and saves the data in parquet format + +[api.NexVeridian.com](https://api.NexVeridian.com) + +Not affiliated with Ark Invest + +# Install for csv download +Copy docker-compose.yml + +Create data folder next to docker-compose.yml +``` +├───data +│ └───parquet +├───docker-compose.yml +``` + +`docker compose up --pull always` # Dev Install ## Dev Containers @@ -18,3 +34,12 @@ Run tests with `cargo t` `docker compose build && docker compose up` Remove the cargo cache for buildkit with `docker builder prune --filter type=exec.cachemount` + +# Install for api +`git clone` + +in main.rs change `Source::Ark` to 
`Source::ApiIncremental` or `Source::ApiFull` for first run + +in docker-compose.yml remove this line: `image: ghcr.io/NexVeridian/ark-invest-api-rust-data:latest` + +uncomment everything else diff --git a/docker-compose.yml b/docker-compose.yml index c280888..f45ba81 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,8 +1,8 @@ version: "3" services: ark-invest-api-rust-data: - # image: ghcr.io/NexVeridian/ark-invest-api-rust-data:latest - image: ark-invest-api-rust-data + image: ghcr.io/NexVeridian/ark-invest-api-rust-data:latest + # image: ark-invest-api-rust-data container_name: ark-invest-api-rust-data build: context: . @@ -12,6 +12,14 @@ services: restart: unless-stopped volumes: - ./data:/ark-invest-api-rust-data/data + # ark-invest-api-rust-data-test: + # container_name: ark-invest-api-rust-data-test + # build: + # context: . + # target: test + # args: + # DOCKER_BUILDKIT: 1 + # restart: no volumes: data: diff --git a/src/main.rs b/src/main.rs index ea5fe15..50e8d0a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,38 +1,59 @@ use clokwerk::{AsyncScheduler, Job, TimeUnits}; -// use polars::prelude::LazyFrame; -// use polars::prelude::*; +use futures::future::join_all; use rand::Rng; use std::error::Error; use std::result::Result; -use std::time::Duration; +use std::thread; use strum::IntoEnumIterator; +use tokio::task; +use tokio::time::{sleep, Duration}; mod util; use util::*; + #[tokio::main] async fn main() { let mut scheduler = AsyncScheduler::new(); println!("Scheduler Started"); - scheduler.every(1.day()).at("11:30 pm").run(|| async { - for x in Ticker::iter() { - if x == Ticker::ARKVC { - continue; - } - let plan = || -> Result<(), Box> { - let df = Ark::new(Source::Ark, x, None)? - .format()? - .write_parquet()? 
- .collect()?; - println!("{:#?}", df.head(Some(1))); - Ok(()) - }; + fn ark_plan(ticker: Ticker) -> Result<(), Box> { + println!("Starting: {:#?}", ticker); + let sec = Duration::from_secs(rand::thread_rng().gen_range(5 * 60..=30 * 60)); + // sleep(sec).await; + thread::sleep(sec); - if plan().is_ok() {} - let sec = rand::thread_rng().gen_range(10..=30); - tokio::time::sleep(Duration::from_secs(sec)).await; - } - }); + let df = Ark::new(Source::Ark, ticker, None)? + .format()? + .write_parquet()? + .collect()?; + + println!("Ticker: {:#?}\n{:#?}", ticker, df.tail(Some(1))); + Ok(()) + } + + async fn spawn_ark_plan(ticker: Ticker) -> Result<(), Box> { + task::spawn_blocking(move || ark_plan(ticker).unwrap()) + .await + .unwrap(); + Ok(()) + } + + async fn ark_etf() { + let futures = Ticker::iter() + .filter(|&x| x != Ticker::ARKVC) + .map(spawn_ark_plan) + .collect::>(); + + join_all(futures).await; + } + + // ark_etf().await; + scheduler.every(1.day()).at("11:30 pm").run(ark_etf); + + scheduler + .every(5.day()) + .at("11:30 pm") + .run(|| async { if spawn_ark_plan(Ticker::ARKVC).await.is_ok() {} }); loop { scheduler.run_pending().await; @@ -40,24 +61,3 @@ async fn main() { tokio::time::sleep(Duration::from_secs(1)).await; } } - -// fn main() -> Result<(), Box> { -// let csv = Ark::merge_old_csv_to_parquet(Ticker::ARKK, None)? -// .format()? -// .write_parquet()? 
-// .collect()?; -// println!("{:#?}", csv); -// let read = Ark::new(Source::Read, Ticker::ARKK, None)?.collect()?; -// println!("{:#?}", read.dtypes()); -// println!("{:#?}", read.get_column_names()); -// println!("{:#?}", read); -// let api = Ark::new(Source::ApiFull, Ticker::ARKK, None)?.collect()?; -// println!("{:#?}", api); - -// let ark = Ark::new(Source::Ark, Ticker::ARKK, None)?.collect()?; -// println!("{:#?}", ark); - -// let ark = Ark::new(Source::Ark, Ticker::ARKVC, None)?.collect()?; -// println!("{:#?}", ark); -// Ok(()) -// } diff --git a/src/util.rs b/src/util.rs index 8107be1..b286c6c 100644 --- a/src/util.rs +++ b/src/util.rs @@ -4,6 +4,8 @@ use polars::datatypes::DataType; use polars::lazy::dsl::StrptimeOptions; use polars::prelude::*; use reqwest::blocking::Client; +use reqwest::header; +use reqwest::header::{HeaderMap, HeaderValue}; use serde_json::Value; use std::error::Error; use std::fs::{create_dir_all, File}; @@ -12,7 +14,7 @@ use std::path::Path; use std::result::Result; use strum_macros::EnumIter; -#[derive(strum_macros::Display, EnumIter, Clone, Copy, PartialEq)] +#[derive(strum_macros::Display, EnumIter, Clone, Copy, PartialEq, Debug)] pub enum Ticker { ARKVC, ARKF, @@ -408,8 +410,24 @@ pub enum Reader { impl Reader { pub fn get_data_url(&self, url: String) -> Result> { + let mut headers = HeaderMap::new(); + headers.insert( + header::USER_AGENT, + HeaderValue::from_static("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"), + ); + + headers.insert( + header::ACCEPT, + HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"), + ); + headers.insert( + header::ACCEPT_LANGUAGE, + HeaderValue::from_static("en-US,en;q=0.8"), + ); + let response = Client::builder() - .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3") + 
.default_headers(headers) + .gzip(true) .build()?.get(url).send()?; if !response.status().is_success() {