dataLoad.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543
  1. #!/usr/bin/env python3
  2. from PIL import Image
  3. from pprint import pprint
  4. import sys
  5. import time
  6. import os
  7. import requests
  8. import random
  9. import json
  10. import argparse
  11. import subprocess
  12. import imager
  13. parser = argparse.ArgumentParser(description="S0urce.io utility program.")
  14. parser.add_argument("--download", help="Download Images", action="store_true")
  15. parser.add_argument("--train", help="Convert Images to Text", action="store_true")
  16. parser.add_argument("--quick", help="Quick convert Images to Text", action="store_true")
  17. parser.add_argument("--update", help="Update s0urce.js script", action="store_true")
  18. parser.add_argument("JSON", type=str, nargs="?", help="Filename to save results", default="test.js")
  19. args = parser.parse_args()
  20. # pprint(args)
  21. # Or perhaps have parser.print_help()
  22. if ( not args.download and not args.train and not args.quick and not args.update ):
  23. parser.error("Need --download, --train, --quick, or --update")
  24. # httpbin.org/headers
  25. sess = requests.Session()
  26. head = {
  27. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
  28. }
  29. sess.headers.update(head)
  30. ON = "X" # Dark pixel in an image
  31. OFF = "." # Light pixel in an image
  32. DIR = (
  33. "data"
  34. ) # Data directory name, do we really need this? Is it really going to change?
  35. INTENSITY = (
  36. 75
  37. ) # How bright does something have to be to trigger it being a dark or light pixel?
  38. # Looks like around 75 removes the extra stuff that s0urce.io does to prevent it from being just matching images.
  39. GREEN_DIFF = 10
  40. # How much brighter the green channel must be (compared to the others),
  41. # to be called green.
  42. VALID_WORDS = {
  43. "constructor", "info", "anon", "send", "com", "root", "port", "val",
  44. "add", "ghost", "net", "http", "status", "syscall", "part", "delete",
  45. "datatype", "loadbytes", "setping", "size", "system", "setstats",
  46. "join", "socket", "signal", "dir", "accountname", "decryptfile",
  47. "intel", "xml", "connect", "sizeof", "writefile", "call", "reset",
  48. "global", "user", "remove", "count", "set", "loop", "num",
  49. "client", "file", "channel", "right", "stat", "emit", "handle",
  50. "buffer", "mysql", "write", "type", "list", "temp", "getfile",
  51. "thread", "decrypt", "poly", "setcookie", "domain", "length",
  52. "gridwidth", "upload", "get", "generatecodepack", "data",
  53. "process", "download", "proxy", "fillgrid", "bit", "encryptfile",
  54. "host", "ping", "event", "url", "load", "key", "changepassword",
  55. "bufferpingset", "getfirewallchannel", "getinfo", "getping", "pass",
  56. "newserver", "username", "generate", "userport", "init", "net",
  57. "left", "point", "cookies", "protocol", "responder", "getkey",
  58. "hostserver", "eventtype", "gridheight", "server", "setport",
  59. "getpass", "loadloggedpassword", "destroybatch", "getxmlprotocol",
  60. "channelsetpackage", "batchallfiles", "module", "response",
  61. "serverproxy", "filetype", "urlcheck", "config", "number",
  62. "ghostfilesystem", "disconnectserver", "emitconfiglist",
  63. "dodecahedron", "eventlistdir", "systemportkey", "setnewproxy",
  64. "createnewsocket", "changeusername", "tempdatapass", "blockthreat",
  65. "statusofprocess", "patcheventlog", "newline", "dir", "bytes",
  66. "findpackage", "package", "encode", "joinnetworkclient",
  67. "rootcookieset", "callmodule", "sizeofhexagon", "createfilethread",
  68. "includedirectory", "loadregisterlist", "encryptunpackedbatch",
  69. "getpartoffile", "getdatapassword", "create2axisvector",
  70. "create3axisvector", "disconnectchannel", "setnewid", "hexagon",
  71. "account", "removenewcookie", "getid", "encodenewfolder",
  72. "sendintelpass", "getlog", "command", "threat", "userid",
  73. "wordcounter", "removeoldcookie", "hostnewserver", "disconnect",
  74. "listconfig", "newhost", "createnewpackage", "loadaltevent", "log",
  75. "filedir", "fileexpresslog", "decryptdatabatch", "mergesocket",
  76. "unpacktmpfile", "uploaduserstats", "getmysqldomain",
  77. "checkhttptype", "encrypt", "vector", "httpbuffersize",
  78. "systemgridtype", "password", "respondertimeout", "deleteallids",
  79. "exportconfigpackage", "export"
  80. }
  81. # Check the environment, do we have all that we need?
  82. if not os.path.exists("images"):
  83. os.mkdir("images")
  84. if not os.path.exists("data"):
  85. os.mkdir("data")
  86. if not os.path.exists("words.txt"):
  87. with open('words.txt', 'w') as f: # Create a empty file
  88. f.write('')
  89. def image_filename(difficulty, index):
  90. return f"images/{difficulty}_{index}.png"
  91. def cleaned_filename(difficulty, index):
  92. return f"images/{difficulty}_{index}_clean.png"
  93. def cleaner_filename(difficulty, index):
  94. return f"images/{difficulty}_{index}_cleaner.png"
  95. def download(howhard, index):
  96. global sess
  97. """
  98. Download an image based upon how hard it is.
  99. On success, it saves the image file.
  100. Failure raises ConnectionError.
  101. Don't leave stale cleaned images around.
  102. """
  103. r = sess.get(f"http://s0urce.io/client/img/word/{howhard}/{index}")
  104. if r.status_code == 200:
  105. # DRY
  106. with open( image_filename(howhard, index), "wb") as f:
  107. f.write(r.content)
  108. # cleaned images? we need to delete & regenerate those.
  109. cleaned = cleaned_filename(howhard, index)
  110. if os.path.exists(cleaned):
  111. os.remove(cleaned)
  112. else:
  113. # We did not get a 200 Okay, log this... Hmm maybe we need to make a log file?
  114. # print( f'{howhard}_{index}.png ' + str(r.status_code) )
  115. raise ConnectionError(
  116. "http://s0urce.io/client/img/word/{0}/{1} returned status_code {2}".format(
  117. howhard, index, r.status_code
  118. )
  119. )
def run(difficult, index):
    """
    Show the images of one difficulty as text and ask the user for the word.

    Image indexes 0..69 are tried for `difficult`; the loop stops at the
    first image file that does not exist (not every category has 70
    images). For each image the file is opened, its size is printed,
    scan_img is called to obtain the bounds and pixel total that get
    printed, the rows returned by output_image are printed, and the user
    is prompted for the word.

    The `index` parameter is not used by the body.

    NOTE(review): scan_img and output_image are not defined or imported in
    the visible part of this file -- confirm where they come from.
    NOTE(review): the indentation of this copy was lost; `return word` is
    assumed to sit inside the loop, which makes the function return after
    the first image -- confirm against the real file.
    """
    for x in range(0, 70):
        fname = image_filename(difficult, x)
        if not os.path.exists(fname):
            # print("Could not find '{0}'".format(fname))
            # continue
            # We've reached the end, so stop looking. :P
            break
        print(f"Loading: {fname}")
        im = Image.open(fname)
        pix = im.load()
        size = im.size
        print(f"Size: {size[0]} x {size[1]}")
        # pal is not used afterwards.
        pal = im.getpalette()
        # Defaults; all five are overwritten by the scan_img call below.
        sx = 0
        ex = size[0]
        sy = 0
        ey = size[1]
        total = 0
        sx, sy, ex, ey, total = scan_img(pix, size)
        print(f"Chars within ({sx}, {sy}) - ({ex}, {ey}) total {total} pixels")
        img_s = output_image(pix, size)
        for l in img_s:
            print(l)
        word = input("Word: ")
        # Returns word so it can be stored in a dictionary
        return word
        #print(f"Image saved to '{DIR}/{difficult}_{x}.txt' in byte string")
        # os.remove(f'{fname}') # Grr No bad bean, keep file for error checking
        # print(f"File '{fname}' automatically removed")
  159. key_word = {}
  160. misery = {}
  161. def quicktrain(difficult):
  162. """
  163. Quickly convert the images to text based upon filesize and image size.
  164. """
  165. quick = {
  166. "100_24_7874": "client",
  167. "100_24_7897": "status",
  168. "100_24_7929": "vector",
  169. "100_24_7937": "encode",
  170. "101_24_7761": "getkey",
  171. "101_24_7906": "server",
  172. "101_24_7922": "module",
  173. "101_24_7968": "socket",
  174. "101_24_7984": "config",
  175. "102_24_7983": "export",
  176. "102_24_7987": "number",
  177. "102_24_7997": "buffer",
  178. "102_24_8000": "getlog",
  179. "102_24_8010": "length",
  180. "102_24_8019": "global",
  181. "102_24_8075": "delete",
  182. "102_24_8087": "domain",
  183. "103_24_8085": "remove",
  184. "103_24_8099": "upload",
  185. "103_24_8102": "sizeof",
  186. "103_24_8122": "system",
  187. "103_24_8139": "threat",
  188. "103_24_8159": "userid",
  189. "104_24_8146": "thread",
  190. "114_24_9012": "getpass",
  191. "115_24_8980": "filedir",
  192. "115_24_9063": "account",
  193. "115_24_9113": "cookies",
  194. "116_24_9036": "newline",
  195. "116_24_9072": "getfile",
  196. "116_24_9089": "newhost",
  197. "116_24_9090": "process",
  198. "116_24_9122": "channel",
  199. "116_24_9136": "connect",
  200. "117_24_9101": "setping",
  201. "117_24_9168": "encrypt",
  202. "117_24_9176": "decrypt",
  203. "117_24_9182": "setport",
  204. "117_24_9248": "package",
  205. "117_24_9297": "hexagon",
  206. "118_24_9280": "getinfo",
  207. "118_24_9310": "getping",
  208. "119_24_9300": "syscall",
  209. "119_24_9321": "command",
  210. "131_24_10113": "generate",
  211. "131_24_10275": "userport",
  212. "132_24_10329": "download",
  213. "132_24_10342": "datatype",
  214. "132_24_10356": "username",
  215. "132_24_10379": "filetype",
  216. "132_24_10395": "protocol",
  217. "132_24_10404": "urlcheck",
  218. "133_24_10292": "response",
  219. "134_24_10420": "setstats",
  220. "134_24_10531": "setnewid",
  221. "134_24_10557": "password",
  222. "136_24_10707": "fillgrid",
  223. "145_24_11396": "loadbytes",
  224. "146_24_11393": "writefile",
  225. "147_24_11595": "setcookie",
  226. "148_24_11374": "eventtype",
  227. "148_24_11610": "newserver",
  228. "148_24_11626": "responder",
  229. "149_24_11614": "gridwidth",
  230. "161_24_12648": "hostserver",
  231. "163_24_12693": "listconfig",
  232. "163_24_12750": "callmodule",
  233. "164_24_12800": "disconnect",
  234. "164_24_12835": "gridheight",
  235. "176_24_13510": "mergesocket",
  236. "177_24_13789": "wordcounter",
  237. "177_24_13819": "accountname",
  238. "177_24_13824": "encryptfile",
  239. "177_24_13855": "serverproxy",
  240. "177_24_13871": "decryptfile",
  241. "177_24_13873": "constructor",
  242. "178_24_13825": "findpackage",
  243. "179_24_13843": "blockthreat",
  244. "179_24_14001": "setnewproxy",
  245. "192_24_14660": "dodecahedron",
  246. "192_24_15084": "destroybatch",
  247. "193_24_14925": "tempdatapass",
  248. "193_24_14979": "eventlistdir",
  249. "194_24_15129": "deleteallids",
  250. "195_24_15252": "loadaltevent",
  251. "207_24_16020": "batchallfiles",
  252. "207_24_16074": "sendintelpass",
  253. "208_24_16209": "getpartoffile",
  254. "208_24_16274": "unpacktmpfile",
  255. "208_24_16287": "hostnewserver",
  256. "208_24_16317": "systemportkey",
  257. "209_24_15651": "rootcookieset",
  258. "209_24_16185": "bufferpingset",
  259. "209_24_16243": "sizeofhexagon",
  260. "209_24_16338": "patcheventlog",
  261. "209_24_16357": "checkhttptype",
  262. "223_24_17412": "changeusername",
  263. "223_24_17418": "systemgridtype",
  264. "224_24_17265": "fileexpresslog",
  265. "224_24_17287": "getmysqldomain",
  266. "224_24_17484": "getxmlprotocol",
  267. "224_24_17487": "httpbuffersize",
  268. "224_24_17506": "emitconfiglist",
  269. "225_24_17005": "changepassword",
  270. "237_24_18534": "uploaduserstats",
  271. "238_24_18712": "encodenewfolder",
  272. "239_24_18670": "ghostfilesystem",
  273. "239_24_18700": "getdatapassword",
  274. "239_24_18710": "statusofprocess",
  275. "239_24_18713": "removeoldcookie",
  276. "239_24_18744": "removenewcookie",
  277. "241_24_18811": "createnewsocket",
  278. "253_24_19681": "generatecodepack",
  279. "254_24_19585": "createnewpackage",
  280. "254_24_19791": "disconnectserver",
  281. "254_24_19901": "decryptdatabatch",
  282. "255_24_19874": "includedirectory",
  283. "255_24_19938": "loadregisterlist",
  284. "256_24_19399": "createfilethread",
  285. "256_24_19577": "respondertimeout",
  286. "268_24_20945": "channelsetpackage",
  287. "268_24_20953": "disconnectchannel",
  288. "269_24_20857": "create2axisvector",
  289. "270_24_21037": "create3axisvector",
  290. "271_24_21016": "joinnetworkclient",
  291. "285_24_22237": "getfirewallchannel",
  292. "288_24_22241": "loadloggedpassword",
  293. "300_24_23305": "exportconfigpackage",
  294. "314_24_24079": "encryptunpackedbatch",
  295. "52_24_4127": "xml",
  296. "52_24_4170": "val",
  297. "53_24_4188": "url",
  298. "54_24_4316": "net",
  299. "54_24_4350": "key",
  300. "55_24_4381": "log",
  301. "55_24_4393": "set",
  302. "55_24_4417": "dir",
  303. "56_24_4487": "get",
  304. "57_24_4504": "num",
  305. "57_24_4524": "com",
  306. "57_24_4553": "bit",
  307. "57_24_4593": "add",
  308. "60_24_4770": "add",
  309. "68_24_5431": "http",
  310. "69_24_5525": "pass",
  311. "70_24_5392": "temp",
  312. "70_24_5444": "type",
  313. "70_24_5469": "list",
  314. "70_24_5529": "file",
  315. "70_24_5545": "loop",
  316. "70_24_5579": "ping",
  317. "70_24_5591": "port",
  318. "71_24_5544": "left",
  319. "71_24_5572": "size",
  320. "71_24_5610": "call",
  321. "71_24_5612": "root",
  322. "71_24_5632": "part",
  323. "71_24_5644": "init",
  324. "71_24_5649": "host",
  325. "71_24_5656": "poly",
  326. "71_24_5667": "info",
  327. "71_24_5748": "user",
  328. "72_24_5559": "join",
  329. "72_24_5699": "anon",
  330. "72_24_5753": "data",
  331. "73_24_5678": "stat",
  332. "74_24_5861": "send",
  333. "75_24_6004": "load",
  334. "83_24_6579": "proxy",
  335. "84_24_6417": "event",
  336. "85_24_6661": "intel",
  337. "85_24_6693": "right",
  338. "85_24_6735": "bytes",
  339. "86_24_6678": "mysql",
  340. "86_24_6762": "write",
  341. "87_24_6800": "ghost",
  342. "87_24_6911": "count",
  343. "88_24_6908": "reset",
  344. "88_24_6946": "point",
  345. "88_24_7003": "getid",
  346. "99_24_7741": "signal",
  347. "99_24_7796": "handle"
  348. }
  349. for x in range(0, 70):
  350. fname = image_filename(difficult, x)
  351. if not os.path.exists(fname):
  352. break
  353. # Output the image
  354. im = Image.open(fname)
  355. # imager.output_image(im)
  356. size = im.size
  357. filesize = os.path.getsize(fname)
  358. k = "{0}_{1}_{2}".format(*size, os.path.getsize(fname))
  359. if k in quick:
  360. correct = quick[k]
  361. else:
  362. imager.output_image(im)
  363. print("well, shit.")
  364. sys.exit(5)
  365. print(fname, correct)
  366. key_word[f'{difficult}_{x}'] = correct
  367. # misery[correct] = { 'img_size': "{0},{1}".format(*size), 'filesize': os.path.getsize(fname) }
  368. misery[correct] = "{0}_{1}_{2}".format(*size, os.path.getsize(fname))
  369. def autotrain(difficult):
  370. """
  371. run, represents a single execution of components to the image, (Actuall we do it 1 category at a time instead of just 1 single execution )
  372. those components do the following... (Each category has around 70 items so we standardize on 70, but )
  373. (not all of the categories have 70 and thus we print a File does not exist)
  374. We open and load the image, and get it's size,
  375. then we scan_img for dark and light pixels, <-- This narrows the image down to just the majority of dark pixels
  376. then from that we output the image line by line onto the screen after it has been output_image d into list form,
  377. Where we ask the user what the word is, and after that we save all that to a file in the data directory.
  378. """
  379. for x in range(0, 70):
  380. fname = image_filename(difficult, x)
  381. if not os.path.exists(fname):
  382. break
  383. # print("Could not find '{0}'".format(fname))
  384. # continue
  385. cleaned = cleaned_filename(difficult, x)
  386. if not os.path.exists(cleaned):
  387. imager.image_cleaner(fname, cleaned)
  388. print(f"Loading: {cleaned} ", end='')
  389. fileout = "data/{0}_{1}".format(difficult, x)
  390. output = subprocess.run(
  391. ["tesseract", cleaned, fileout],
  392. stderr=subprocess.DEVNULL,
  393. # capture_output=False,
  394. shell=False,
  395. )
  396. with open(fileout + ".txt", "r") as fp:
  397. word = fp.read().strip().lower()
  398. if (word != '') and (word in VALID_WORDS):
  399. key_word[f'{difficult}_{x}'] = word
  400. im = Image.open(fname)
  401. # imager.output_image(im)
  402. size = im.size
  403. with open('words.txt', 'a') as f:
  404. f.write(f'{difficult}_{x} = {size[0]} x {size[1]} is {word}\n')
  405. print(word)
  406. if word in misery:
  407. print("Awwww SHIT! ", word)
  408. misery["{0}_{1}_{2}".format(*size, os.path.getsize(fname))] = word
  409. # misery[word] = { 'img_size': "{0},{1}".format(*size), 'filesize': os.path.getsize(fname) }
  410. else:
  411. print("UNKNOWN", word)
  412. # Output the image
  413. im = Image.open(fname)
  414. imager.output_image(im)
  415. size = im.size
  416. # img_s = output_image(pix, size)
  417. # for l in img_s:
  418. # print(l)
  419. correct = input("Word: ")
  420. key_word[f'{difficult}_{x}'] = correct
  421. # misery[correct] = { 'img_size': "{0},{1}".format(*size), 'filesize': os.path.getsize(fname) }
  422. if correct in misery:
  423. print("Awwww SHIT! ", correct)
  424. misery["{0}_{1}_{2}".format(*size, os.path.getsize(fname))] = correct
  425. with open('words.txt', 'a') as f:
  426. f.write(f'{difficult}_{x} = {size[0]} x {size[1]} is {correct}\n')
  427. with open('misery.json', 'w') as fp:
  428. json.dump(misery, fp, indent=2, sort_keys=True)
  429. # Now to call all the previous functions
  430. if args.download:
  431. print("Downloading s0urce.io Words")
  432. # smaller is better, and cleaner.
  433. tofetch = { 'e': 62, 'm': 66, 'h': 55 }
  434. for d, max in tofetch.items():
  435. print(d.upper())
  436. for i in range(0, max):
  437. download(d, i)
  438. # time.sleep(random.randint(10, 15))
  439. if args.train:
  440. # Img Processing: Run thru every single category and every single word
  441. for level in ["e", "m", "h"]:
  442. autotrain(level)
  443. with open(args.JSON, 'w') as fp:
  444. json.dump(key_word, fp, sort_keys=True, indent=2)
  445. if args.quick:
  446. # Img Processing: Run thru every single category and every single word
  447. for level in ["e", "m", "h"]:
  448. quicktrain(level)
  449. with open(args.JSON, 'w') as fp:
  450. json.dump(key_word, fp, sort_keys=True, indent=2)
  451. if args.update:
  452. with open(args.JSON, 'r') as fp:
  453. key_word = json.load(fp)
  454. # update the s0urce.js script
  455. filename = 's0urce.user.js'
  456. with open(filename, 'r') as fp:
  457. lines = fp.readlines()
  458. # Lines are now in memory. Time to update!
  459. for i in range(0, len(lines)):
  460. if 'http://s0urce.io/client/img/word/' in lines[i]:
  461. # This is a target line, so:
  462. l = lines[i].strip().strip(':').strip('"')
  463. # gets parts of the path
  464. parts = l.split('/')
  465. # get difficulty and index
  466. dif = parts[-2]
  467. index = parts[-1]
  468. # build the key -- get the word
  469. key = f'{dif}_{index}'
  470. # pprint(parts)
  471. # pprint(key)
  472. word = key_word[key]
  473. print("{0} : {1}".format( key, word))
  474. lines[i+1] = f' form.value = "{word}";' + "\n" # break;\n" # You may need it... or may not.
  475. with open(filename, 'w') as fp:
  476. for line in lines:
  477. fp.write(line)
  478. output = subprocess.run(
  479. ["scp", filename, "linode:/usr/share/nginx/bugz"],
  480. stderr=subprocess.DEVNULL,
  481. # capture_output=False,
  482. shell=False,
  483. )
  484. print("Please visit https://bugz.red-green.com/s0urce.user.js !")
  485. # Regardless what we did let the user know we at least ran and we are now done
  486. print("Complete")