A prototype derived from urllib3.PoolManager to make request data easier to access (for humans and robots alike). The goal is not to revolutionise current practice: curl, wget and the more recent http already do the job just fine. The main benefit here is the JSON output, even if the processing is still fairly rough for now…
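The core idea fits in a few lines: derive from urllib3.PoolManager and serialise the response as JSON instead of printing raw text. A rough sketch only (class and field names here are illustrative, not the actual script, which is listed at the bottom of this page):

import json
import urllib3, certifi

class JSONRequest(urllib3.PoolManager):
    """Toy subclass: same idea as the real script, reduced to a HEAD request."""
    def __init__(self):
        super().__init__(cert_reqs="CERT_REQUIRED", ca_certs=certifi.where())

    def head(self, url):
        r = self.request("HEAD", url)
        # expose the interesting parts of the exchange as a JSON document
        return json.dumps({
            "request":  {"url": url, "method": "HEAD"},
            "response": {"status": r.status, "headers": dict(r.headers)},
        })

print(JSONRequest().head("https://dukeart.netlib.re"))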
JSON output and HEAD as the default method:
$ HTTP dukeart.netlib.re
""
The -v option enables verbose mode, which returns a JSON object:
$ HTTP dukeart.netlib.re -v
{"request": {"url": "dukeart.netlib.re", "method": "HEAD", "charset": "UTF-8", "headers": {"User-Agent": "HTTP1.0/python3/GNU-Linux"}}, "response": {"status": 200, "time": 0.0625386, "headers": {"Cache-Control": "max-age=36000", "Content-Length": "0", "Content-Security-Police": "upgrade-insecure-requests", "Content-Type": "text/html; charset=UTF-8", "Referrer-Policy": "no-referrer-when-downgrade", "Server": "Caddy", "Strict-Transport-Security": "max-age=31536000", "X-Content-Type-Options": "nosniff", "X-Frame-Options": "sameorigin", "Date": "Sat, 14 Nov 2020 09:25:55 GMT"}, "body": "\"\""}, "rss": false}
jq is your friend:
$ HTTP dukeart.netlib.re -v | jq
{
  "request": {
    "url": "dukeart.netlib.re",
    "method": "HEAD",
    "charset": "UTF-8",
    "headers": {
      "User-Agent": "HTTP1.0/python3/GNU-Linux"
    }
  },
  "response": {
    "status": 200,
    "time": 0.0530405,
    "headers": {
      "Cache-Control": "max-age=36000",
      "Content-Length": "0",
      "Content-Security-Police": "upgrade-insecure-requests",
      "Content-Type": "text/html; charset=UTF-8",
      "Referrer-Policy": "no-referrer-when-downgrade",
      "Server": "Caddy",
      "Strict-Transport-Security": "max-age=31536000",
      "X-Content-Type-Options": "nosniff",
      "X-Frame-Options": "sameorigin",
      "Date": "Sat, 14 Nov 2020 10:09:55 GMT"
    },
    "body": ""
  },
  "rss": false
}
Selecting from the full result of the request is done with jq:
$ HTTP dukeart.netlib.re -v | jq '.request.charset,(.response.headers | .Server,.Date,."Content-Type")'
"UTF-8"
"Caddy"
"Sat, 14 Nov 2020 09:31:11 GMT"
"text/html; charset=UTF-8"
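Without jq, the same kind of selection works with a few lines of Python reading the verbose JSON from stdin (pick.py is a hypothetical helper, assuming the JSON layout shown above):

#!/usr/bin/python3
# pick.py - read the verbose JSON on stdin and print a few selected fields
import sys, json

doc = json.load(sys.stdin)
print(doc["request"]["charset"])
print(doc["response"]["headers"]["Server"])
print(doc["response"]["headers"]["Content-Type"])

$ HTTP dukeart.netlib.re -v | python3 pick.py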
The method is set with the -m option (--method) but can also be slipped in before or after the URLs:
$ HTTP dukeart.netlib.re/sitemap.txt GET
https://dukeart.netlib.re/
https://dukeart.netlib.re/info
https://dukeart.netlib.re/rss
https://dukeart.netlib.re/api
https://dukeart.netlib.re/api/rss
https://dukeart.netlib.re/api/rss/feeds
https://dukeart.netlib.re/api/rss/keys
https://dukeart.netlib.re/api/sitemap
https://dukeart.netlib.re/api/sitemap/{query}
https://dukeart.netlib.re/app
[...]
Content selection is done with the -t and -a options (--tag and --attribute respectively):
$ HTTP dukeart.netlib.re GET -t a | jq
[
  "<a class=\"nav-link\" href=\"/#\">home</a>",
  "<a class=\"nav-link\" href=\"/shaarli\">shaarli</a>",
  "<a class=\"nav-link\" href=\"/wiki\">wiki</a>",
  "<a class=\"nav-link\" href=\"/app/?sort=time&order=desc\">app</a>",
  "<a class=\"nav-link\" href=\"/media/?sort=time&order=desc\">media</a>",
  "<a class=\"nav-link\" href=\"/drawing\">drawing</a>",
  "<a aria-expanded=\"true\" aria-haspopup=\"false\" class=\"dropdown-link\" data-toggle=\"dropdown\" href=\"/#\" role=\"button\">Tools ▼</a>",
  "<a class=\"dropdown-itemx\" href=\"/privatebin\">Privatebin</a>",
  "<a class=\"dropdown-itemx\" href=\"/api\">Custom API</a>",
  "<a href=\"/shaarli/?do=rss\" style=\"background:transparent\" title=\"RSS\"><img alt=\"RSS\" height=\"26px\" id=\"logo-rss\" src=\"/media/picture/rss.png\" style=\"border-radius:5px\"/></a>",
  "<a href=\"/info\" style=\"background:transparent\" title=\"info?\"><img alt=\"info\" height=\"26px\" src=\"/media/picture/info.png\" style=\"border-radius:5px\"/></a>"
]
The content selector (BeautifulSoup, aka bs4) can also return only attribute values (-a option) when combined with -t:
$ HTTP dukeart.netlib.re GET -t a -a href | jq
[
  "/#",
  "/shaarli",
  "/wiki",
  "/app/?sort=time&order=desc",
  "/media/?sort=time&order=desc",
  "/drawing",
  "/#",
  "/privatebin",
  "/api",
  "/shaarli/?do=rss",
  "/info"
]
Here we retrieve the href attributes of the a tags, in other words the URIs reachable from this page.
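Under the hood this is little more than bs4's find_all plus an attribute lookup; a simplified sketch of the two modes (not the exact code path of the script):

from bs4 import BeautifulSoup

html = '<a class="nav-link" href="/wiki">wiki</a><a href="/info">info</a>'
soup = BeautifulSoup(html, "html.parser")

# -t a           -> whole tags, as strings
tags  = [str(x) for x in soup.find_all("a")]
# -t a -a href   -> only the attribute values
hrefs = [x.get("href") for x in soup.find_all("a") if x.get("href")]

print(tags)
print(hrefs)   # ['/wiki', '/info']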
The XML tags of RSS feeds are also parsed by BeautifulSoup behind the scenes:
$ HTTP dukeart.netlib.re/rss GET -t pubdate | jq
[
  "<pubdate>Fri, 13 Nov 2020 10:07:16 +0100</pubdate>",
  "<pubdate>Fri, 13 Nov 2020 09:53:32 +0100</pubdate>",
  "<pubdate>Thu, 12 Nov 2020 17:51:51 +0100</pubdate>",
  "<pubdate>Mon, 09 Nov 2020 08:40:55 +0100</pubdate>",
  "<pubdate>Mon, 09 Nov 2020 08:24:29 +0100</pubdate>",
  [...]
]
Several tags can also be extracted in one go:
$ HTTP dukeart.netlib.re/rss GET -t title pubdate | jq
[
  "<title>dukeart</title>",
  "<pubdate>Fri, 13 Nov 2020 10:07:16 +0100</pubdate>",
  "<title>GitHub - httpie/httpie: As easy as /aitch-tee-tee-pie/ 🥧 Modern, user-friendly command-line HTTP client for the API era. JSON support, colors, sessions, downloads, plugins & more. https://twitter.com/httpie</title>",
  "<pubdate>Fri, 13 Nov 2020 09:53:32 +0100</pubdate>",
  [...]
]
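Behind the scenes the script runs one find_all per requested tag and interleaves the results with zip; a rough illustration (which also shows why the option is -t pubdate in lowercase: html.parser lowercases tag names):

from bs4 import BeautifulSoup

rss = """<rss><channel><title>dukeart</title>
<item><title>first post</title><pubDate>Fri, 13 Nov 2020 10:07:16 +0100</pubDate></item>
<item><title>second post</title><pubDate>Fri, 13 Nov 2020 09:53:32 +0100</pubDate></item>
</channel></rss>"""

soup = BeautifulSoup(rss, "html.parser")
# one list per tag, then interleaved pairwise
for title, pubdate in zip(soup.find_all("title"), soup.find_all("pubdate")):
    print(str(title), str(pubdate))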
POST with form-style key=value pairs given directly on the command line:
$ HTTP httpbin.org/post POST test=toto poop=ok | jq
{
  "args": {},
  "data": "{\"test\": \"toto\", \"poop\": \"ok\"}",
  "files": {},
  "form": {},
  "headers": {
    "Accept-Encoding": "identity",
    "Content-Length": "30",
    "Host": "httpbin.org",
    "X-Amzn-Trace-Id": "Root=1-5fafe6d2-33c6528861113f113b0c773d"
  },
  "json": {
    "poop": "ok",
    "test": "toto"
  },
  "origin": "78.192.4.55",
  "url": "http://httpbin.org/post"
}
Another example, passing a JSON body directly:
$ HTTP httpbin.org/post POST -H Content-Type=application/json -d '{ "this":"ok" }' | jq
{
  "args": {},
  "data": "{\"this\": \"ok\"}",
  "files": {},
  "form": {},
  "headers": {
    "Accept-Encoding": "identity",
    "Content-Length": "14",
    "Content-Type": "application/json",
    "Host": "httpbin.org",
    "X-Amzn-Trace-Id": "Root=1-5fc3cc1e-31a6fe560b9ed2184570edff"
  },
  "json": {
    "this": "ok"
  },
  "origin": "78.192.4.55",
  "url": "http://httpbin.org/post"
}
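For reference, this is roughly what the script does underneath (a sketch using PoolManager.request directly rather than the script's request_encode_body call):

import json
import urllib3, certifi

http = urllib3.PoolManager(cert_reqs="CERT_REQUIRED", ca_certs=certifi.where())
r = http.request(
    "POST",
    "http://httpbin.org/post",
    body=json.dumps({"this": "ok"}),
    headers={"Content-Type": "application/json"},
)
print(r.status)                                      # 200
print(json.loads(r.data.decode("utf-8"))["json"])    # {'this': 'ok'}

The full script: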
#!/usr/bin/python3
# coding: utf-8

import re,sys,json,time
from argparse import ArgumentParser

# TO AVOID BROKEN PIPES
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)

try:
    import urllib3,certifi
    from bs4 import BeautifulSoup
except Exception as e:
    print(e)
    print("Python is missing resources. Try $ sudo pip3 install certifi urllib3 beautifulsoup4")
    sys.exit(1)

CHARSET = ["UTF-8","ISO-8859-1","Latin-1"]

# CLASS INTERFACE / OVERLOAD IS COMING
class Request(urllib3.PoolManager):

    default_headers = { "User-Agent":"%s1.0/python3/bs4"%(__file__.split("/")[-1]) }

    def __init__(self,url=None,**kargs):
        HEADERS = dict(Request.default_headers)
        if "headers" in kargs.keys():
            HEADERS.update(kargs["headers"])
        urllib3.PoolManager.__init__( self,
            cert_reqs = 'CERT_REQUIRED',
            ca_certs  = certifi.where(),
            headers   = HEADERS )
        self.url      = url
        self.method   = kargs["method"]   if "method"   in kargs.keys() else "GET"
        self.redirect = kargs["redirect"] if "redirect" in kargs.keys() else 1
        self.timeout  = kargs["timeout"]  if "timeout"  in kargs.keys() else 10
        self.charset  = "utf-8"
        self.response = None
        self.content  = None
        self.rss      = False

    def get(self,url=None,method=None,tag=None,body=None,parse=0,attr=None,headers={},verbose=0):
        """
        Request method:
        - url
        - method (HTTP)
        - headers (HTTP)
        - tag (HTML/RSS)
        - attr (HTML)
        - body (PAYLOAD for POST,PUT)
        - parse (PRINT TEXT CONTENT)
        - verbose (PRINT ALL REQUEST)
        """
        # METHOD/URL OVERWRITING
        self.method = method if method != None else self.method
        self.url    = url    if url    != None else self.url
        # REQUEST HEADERS
        self.headers.update( headers )
        # ACTUAL REQUEST
        tx = time.time()
        if body in [{},None]:
            self.response = self.request_encode_url( self.method, self.url,
                timeout  = self.timeout,
                redirect = self.redirect,
                headers  = headers )
        else:
            body = json.dumps(body)
            self.response = self.request_encode_body( self.method, self.url,
                timeout  = self.timeout,
                redirect = self.redirect,
                headers  = headers,
                body     = body )
        tx = time.time()-tx
        self.response.time = round(tx,7)
        self.is_rss()
        # CONTENT TYPE RETRIEVAL
        content_type = self.response.headers["Content-Type"]
        charset = re.findall("charset=[^;]+",content_type)
        if len(charset)>0:
            self.charset = charset[0].split("=")[1]
        # CONTENT DECODING WITH THE DECLARED CHARSET
        try:
            content = self.response.data.decode(self.charset)
        except Exception as e:
            self.charset = None
        # CONTENT DECODING - FALLBACK CHARSETS
        if not self.charset:
            for charset in CHARSET:
                try:
                    content = self.response.data.decode(charset)
                    self.charset = charset
                    break
                except Exception as e:
                    self.charset = "Unknown"
        # HTML PARSE FOR HTML/RSS CONTENT
        if any([ content_type.count(x) for x in ["text/html","rss","xml"] ]):
            data = BeautifulSoup(content,"html.parser")
            if tag:
                content = []
                for xtag in tag:
                    content.append( data.find_all(xtag) )
                ncontent = []
                for u in zip(*content):
                    ncontent.extend( u )
                content = ncontent
                # OPTION -a: SEARCH FOR A SPECIFIC TAG ATTRIBUTE
                if attr:
                    # USING A KEY=VALUE SYNTAX TO TARGET A SPECIFIC ATTRIBUTE'S VALUE
                    if attr.count("="):
                        attr,val = attr.split("=")
                        content = [ str(x) for x in content if x.get(attr) and x.get(attr)==val ]
                    # ELSE RETRIEVE ALL TAGS WITH ATTRIBUTE
                    else:
                        content = [ str(x.get(attr)) for x in content if x.get(attr) ]
                # ELSE RETRIEVE ALL TAGS
                else:
                    content = [ str(x) for x in content ]
            # DECODING IS COMING
            else:
                # TRY A STRING TYPE OR ELSE JSON
                content = content.get_text() if type(content)!=str else json.dumps(content)
        # JSON IS DETECTED
        elif content_type.count("json"):
            self.content = content
            # content = json.loads(content)
            return self if verbose else self.content
        # PLAIN TEXT CONTENT
        elif content_type.count("text/plain"):
            content = self.response.data.decode(self.charset)
            self.content = content.strip()
            return self if verbose else self.content
        self.content = json.loads(content) if type(content)==str else content
        return self if verbose else json.dumps(self.content)

    def get_headers(self):
        return json.dumps( self.headers )

    def get_response_headers(self):
        return json.dumps( dict(self.response.headers) )

    def __str__(self):
        """ JSON Datagram for the verbose mode """
        this = {
            "request": {
                "url"     : self.url,
                "method"  : self.method,
                "charset" : self.charset,
                "headers" : self.headers },
            "response": {
                "status"  : self.response.status,
                "time"    : self.response.time,
                # "size"  : len(self.response.data.decode(self.charset)),
                "headers" : dict(self.response.headers),
                "body"    : self.content },
            "rss" : self.rss }
        return json.dumps( this )

    def is_rss(self):
        self.rss = any(self.response.headers["Content-Type"].count(x) for x in [ "xml","rss" ])
        return self.rss

# CLI INTERFACE
if __name__ == '__main__':

    # OPTION / ARGUMENT PARSER
    op = ArgumentParser(description="HTTP Request command")
    op.add_argument(dest="url",nargs="+")
    op.add_argument("-m","--method",   type=str, default="GET")
    op.add_argument("-t","--tag",      type=str, nargs="+")
    op.add_argument("-a","--attribute",type=str)
    op.add_argument("-p","--parse",    action="store_true")
    op.add_argument("-H","--header",   type=str, nargs="*")
    op.add_argument("-d","--data",     type=str, nargs="*")
    op.add_argument("-v","--verbose",  action="store_true")
    op.add_argument("-r","--rssfile",  type=str, default="/home/duke/document/rss.txt")
    args = op.parse_args()

    req = Request()

    # EASY NOTATION: HTTP METHOD IS CAUGHT AND REMOVED FROM THE URL LIST
    bdata = {}
    urls  = []
    for url in args.url:
        if url in ["HEAD","OPTIONS","GET","POST","PUT","DELETE"]:
            args.method = url
        elif url.count("=") and not url.startswith("http"):
            k,v = url.split("=")[0],"=".join(url.split("=")[1:])
            bdata.update({k:v})
        else:
            urls.append( url )

    # HEADERS CONSTRUCTION
    headers = {}
    if args.header:
        for k,v in [ x.split("=") for x in args.header ]:
            headers.update({k:v})

    # TEST PAYLOAD PRESENCE
    if args.data:
        try:
            # -d '{"key": "value"}' IS PARSED AS A JSON BODY
            bdata = json.loads(" ".join(args.data))
        except ValueError:
            # OTHERWISE -d key=value PAIRS GO INTO THE QUERY STRING
            headers.update({"Content-Type":"application/x-www-form-urlencoded"})
            urls = [ url+"?"+"&".join(args.data) for url in urls ]

    # ALL IN ONE
    for url in urls:
        response = req.get(
            url     = url,
            method  = args.method,
            headers = headers,
            tag     = args.tag,
            parse   = args.parse,
            attr    = args.attribute,
            body    = bdata,
            verbose = args.verbose)
        if req.rss:
            with open(args.rssfile,"a") as fd:
                fd.write("%s\n"%req.url)
        print( response )