Initial
commit 71f42e1d88
14 changed files with 2142 additions and 0 deletions

.env (Normal file, 1 addition)
@@ -0,0 +1 @@
DATABASE_URL=postgres://postgres:example@localhost:5433/searchinblog

.envrc (Normal file, 1 addition)
@@ -0,0 +1 @@
use flake

.gitignore (vendored, Normal file, 2 additions)
@@ -0,0 +1,2 @@
/target
.direnv

Cargo.lock (generated, Normal file, 1832 additions)
File diff suppressed because it is too large.

Cargo.toml (Normal file, 14 additions)
@@ -0,0 +1,14 @@
[package]
name = "searchinblog"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.68"
axum = { version = "0.6.4", features = ["http2", "macros"] }
clap = { version = "4.1.4", features = ["derive", "cargo"] }
reqwest = { version = "0.11.14", default-features = false, features = ["rustls-tls-webpki-roots", "json", "gzip", "mime_guess"] }
scraper = "0.14.0"
tokio = { version = "1.24.2", features = ["full"] }

README.md (Normal file, 14 additions)
@@ -0,0 +1,14 @@
## Crawling requirements

- You must serve a robots.txt describing the terms of crawling; otherwise the
  site will be ignored by this bot.

- You must serve an Atom feed.

## RFC References

- **[RFC4287]**: The Atom Syndication Format
- **[RFC9309]**: Robots Exclusion Protocol

[rfc4287]: https://www.rfc-editor.org/rfc/rfc4287
[rfc9309]: https://www.rfc-editor.org/rfc/rfc9309
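
For illustration, a site that satisfies both requirements could serve a permissive robots.txt and advertise its feed from the HTML head; the feed path and policy below are made-up examples, not something this commit prescribes. The type attribute is what src/crawler.rs selects on.

  robots.txt:
    User-agent: *
    Allow: /

  HTML head:
    <link rel="alternate" type="application/atom+xml" title="Atom feed" href="/atom.xml">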

default.nix (Normal file, 12 additions)
@@ -0,0 +1,12 @@
{ toolchain, makeRustPlatform, pkg-config }:

let rustPlatform = makeRustPlatform { inherit (toolchain) cargo rustc; };

in rustPlatform.buildRustPackage {
  name = "searchinblog";
  src = ./.;
  cargoLock.lockFile = ./Cargo.lock;

  nativeBuildInputs = [ pkg-config ];
  buildInputs = [ ];
}

docker-compose.yml (Normal file, 8 additions)
@@ -0,0 +1,8 @@
version: "3"

services:
  db:
    image: postgres
    ports: ["5433:5432"]
    environment:
      POSTGRES_PASSWORD: example
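
Presumably the workflow is to bring up this Postgres container and then apply the migration against it; with the sqlx-cli from the dev shell and the DATABASE_URL from .env, that would look roughly like:

  docker compose up -d db
  sqlx database create
  sqlx migrate run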

flake.lock (Normal file, 93 additions)
@@ -0,0 +1,93 @@
{
  "nodes": {
    "fenix": {
      "inputs": {
        "nixpkgs": "nixpkgs",
        "rust-analyzer-src": "rust-analyzer-src"
      },
      "locked": {
        "lastModified": 1674541370,
        "narHash": "sha256-L62dKDX6fIUQhlna9R8PKSAEGZ7ueU5gRGxZzZs/Zx8=",
        "owner": "nix-community",
        "repo": "fenix",
        "rev": "4435f8e9da13581e51ba1f92a25d7d54c776ad94",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
        "repo": "fenix",
        "type": "github"
      }
    },
    "flake-utils": {
      "locked": {
        "lastModified": 1667395993,
        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
        "type": "github"
      },
      "original": {
        "id": "flake-utils",
        "type": "indirect"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1674211260,
        "narHash": "sha256-xU6Rv9sgnwaWK7tgCPadV6HhI2Y/fl4lKxJoG2+m9qs=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "5ed481943351e9fd354aeb557679624224de38d5",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1667629849,
        "narHash": "sha256-P+v+nDOFWicM4wziFK9S/ajF2lc0N2Rg9p6Y35uMoZI=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "3bacde6273b09a21a8ccfba15586fb165078fb62",
        "type": "github"
      },
      "original": {
        "id": "nixpkgs",
        "type": "indirect"
      }
    },
    "root": {
      "inputs": {
        "fenix": "fenix",
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs_2"
      }
    },
    "rust-analyzer-src": {
      "flake": false,
      "locked": {
        "lastModified": 1674491573,
        "narHash": "sha256-1hMPOn2dlMfWRWvuaWcSxNquKpvjGVXq2rVw6UJy75Q=",
        "owner": "rust-lang",
        "repo": "rust-analyzer",
        "rev": "c552e5a55f13b2f08d506bb46fb74dbc11702d0d",
        "type": "github"
      },
      "original": {
        "owner": "rust-lang",
        "ref": "nightly",
        "repo": "rust-analyzer",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}

flake.nix (Normal file, 28 additions)
@@ -0,0 +1,28 @@
{
  inputs = { fenix.url = "github:nix-community/fenix"; };

  outputs = { self, nixpkgs, flake-utils, fenix }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
          overlays = [ fenix.overlays.default ];
        };

        toolchain = pkgs.fenix.stable;

        flakePkgs = {
          searchinblog = pkgs.callPackage ./. { inherit toolchain; };
        };
      in rec {
        packages = flake-utils.lib.flattenTree flakePkgs;
        defaultPackage = packages.searchinblog;

        devShell = pkgs.mkShell {
          inputsFrom = with packages; [ searchinblog ];
          packages = (with pkgs; [ cargo-watch cargo-deny cargo-edit sqlx-cli ])
            ++ (with toolchain; [ cargo rustc rustfmt ]);
          CARGO_UNSTABLE_SPARSE_REGISTRY = "true";
        };
      });
}
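
Since the flake exposes both a package and a dev shell, the usual entry points would be nix develop for the pinned toolchain plus cargo utilities, and nix build for the package defined in default.nix:

  nix develop
  nix build .#searchinblog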

migrations/20230125084829_initial.sql (Normal file, 34 additions)
@@ -0,0 +1,34 @@
CREATE TABLE websites (
  id INTEGER PRIMARY KEY,
  domain TEXT UNIQUE,
  last_collected TIMESTAMP WITHOUT TIME ZONE
);

CREATE TABLE posts (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE,
  last_updated_by_version TEXT,
  content TEXT,
  content_tsv TSVECTOR
);

CREATE INDEX "posts_fti_idx" ON posts USING GIN(content_tsv);
CREATE TRIGGER "posts_fti_trigger" BEFORE INSERT OR UPDATE
  ON posts FOR EACH ROW
  EXECUTE PROCEDURE tsvector_update_trigger(content_tsv, 'pg_catalog.english', content);

CREATE TABLE authors (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE
);

CREATE TABLE posts_keywords (
  post_id INTEGER,
  keyword TEXT,

  PRIMARY KEY(post_id, keyword),
  CONSTRAINT fk_post FOREIGN KEY(post_id) REFERENCES posts(id)
  -- CONSTRAINT fk_keyword FOREIGN KEY(keyword) REFERENCES keywords(keyword)
);

CREATE INDEX "posts_keywords_idx" ON posts_keywords(keyword);
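
To see how this schema is meant to be used, a full-text query against the GIN-indexed content_tsv column would look roughly like the following (the search terms are only an example):

  SELECT url
  FROM posts
  WHERE content_tsv @@ to_tsquery('pg_catalog.english', 'rust & crawler')
  ORDER BY ts_rank(content_tsv, to_tsquery('pg_catalog.english', 'rust & crawler')) DESC
  LIMIT 10;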

rustfmt.toml (Normal file, 2 additions)
@@ -0,0 +1,2 @@
max_width = 80
tab_spaces = 2

src/crawler.rs (Normal file, 54 additions)
@@ -0,0 +1,54 @@
//! Crawler
//! ===
//!
//! The crawler design is built on an important Postgres feature: pub/sub
//! (LISTEN/NOTIFY). It always waits on two events:
//!
//! - One that listens to Postgres pub/sub triggers
//! - One that waits on a timer based on the next available job
//!
//! Once either event fires, the crawler retrieves the job from the database
//! and performs the crawl: it fetches the site, populates the database, and
//! fills the full-text index.

use anyhow::Result;
use reqwest::Url;
use scraper::{Html, Selector};

pub async fn crawler() -> Result<()> {
  Ok(())
}

#[derive(Debug, Parser)]
pub struct CrawlSingleSiteOpts {
  /// The domain to crawl
  #[clap(name = "domain", long)]
  domain: String,
}

/// Crawl a single site.
pub async fn crawl_single_site(opts: CrawlSingleSiteOpts) -> Result<()> {
  let CrawlSingleSiteOpts { domain, .. } = opts;

  // Check the robots.txt of the site to see if or how we can crawl
  let url = Url::parse(&format!("https://{domain}/robots.txt"))?;
  let response = reqwest::get(url).await?;
  println!("Response: {response:?}");

  // Try to look for an atom feed by parsing the HTML <head> links
  let url = Url::parse(&format!("https://{domain}"))?;
  let response = reqwest::get(url).await?;
  let body = response.text().await?;
  let html = Html::parse_document(&body);
  let tag = Selector::parse("link[type=\"application/atom+xml\"]").unwrap();
  for element in html.select(&tag) {
    let attrs = element.value().attrs();
    for attr in attrs {
      println!("Attrs: {attr:?}");
    }
  }

  // Go through each of the entries in the atom feed

  Ok(())
}
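
The crawler() function above is still a stub. Based on the module doc comment, the intended event loop selects over a Postgres notification and a timer; below is a minimal sketch of that shape, with a tokio channel standing in for the LISTEN/NOTIFY stream (no Postgres client is wired up in this commit, and JobNotification and run_loop are hypothetical names):

  use std::time::Duration;

  use anyhow::Result;
  use tokio::sync::mpsc;

  /// Hypothetical stand-in for a Postgres NOTIFY payload.
  struct JobNotification;

  /// Wait for either a notification or the next timer tick, then crawl.
  async fn run_loop(mut notifications: mpsc::Receiver<JobNotification>) -> Result<()> {
    loop {
      tokio::select! {
        note = notifications.recv() => {
          // Channel closed: the notification source went away, so stop.
          if note.is_none() {
            break;
          }
        }
        // Stand-in timer; the real loop would sleep until the next job is due.
        _ = tokio::time::sleep(Duration::from_secs(60)) => {}
      }

      // Here the real crawler would fetch the due job from the database,
      // crawl the site, and update the full-text index.
    }
    Ok(())
  }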

src/main.rs (Normal file, 47 additions)
@@ -0,0 +1,47 @@
#[macro_use]
extern crate clap;

mod crawler;

use anyhow::Result;
use clap::Parser;

use crate::crawler::{crawl_single_site, crawler, CrawlSingleSiteOpts};

#[derive(Parser, Debug)]
struct Opt {
  #[clap(subcommand)]
  command: Subcommand,
}

#[derive(Parser, Debug)]
enum Subcommand {
  #[clap(name = "crawler")]
  Crawler,

  #[clap(name = "web")]
  Web,

  /// Crawl just a single site
  #[clap(name = "crawl-single-site")]
  CrawlSingleSite(CrawlSingleSiteOpts),
}

#[tokio::main]
async fn main() -> Result<()> {
  let opt = Opt::parse();

  println!("Hello, world! {opt:?}");

  match opt.command {
    Subcommand::Web => {}
    Subcommand::Crawler => {
      crawler().await?;
    }
    Subcommand::CrawlSingleSite(opts) => {
      crawl_single_site(opts).await?;
    }
  }

  Ok(())
}
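
Given these subcommands, running the binary looks like the following (example.com is just a placeholder domain):

  cargo run -- crawler
  cargo run -- web
  cargo run -- crawl-single-site --domain example.com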