dataLoad.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566
  1. #!/usr/bin/env python3
  2. from PIL import Image
  3. from pprint import pprint
  4. import sys
  5. import time
  6. import os
  7. import requests
  8. import random
  9. import json
  10. import argparse
  11. import subprocess
  12. import imager
  13. parser = argparse.ArgumentParser(description="S0urce.io utility program.")
  14. parser.add_argument("--download", help="Download Images", action="store_true")
  15. parser.add_argument("--train", help="Convert Images to Text", action="store_true")
  16. parser.add_argument("--quick", help="Quick convert Images to Text", action="store_true")
  17. parser.add_argument("--update", help="Update s0urce.js script", action="store_true")
  18. parser.add_argument("JSON", type=str, nargs="?", help="Filename to save results", default="test.js")
  19. args = parser.parse_args()
  20. # pprint(args)
  21. # If no option, display help and exit
  22. if ( not args.download and not args.train and not args.quick and not args.update ):
  23. parser.print_help()
  24. sys.exit()
  25. # httpbin.org/headers
  26. sess = requests.Session()
  27. head = {
  28. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
  29. }
  30. sess.headers.update(head)
  31. ON = "X" # Dark pixel in an image
  32. OFF = "." # Light pixel in an image
  33. DIR = (
  34. "data"
  35. ) # Data directory name, do we really need this? Is it really going to change?
  36. INTENSITY = (
  37. 75
  38. ) # How bright does something have to be to trigger it being a dark or light pixel?
  39. # Looks like around 75 removes the extra stuff that s0urce.io does to prevent it from being just matching images.
  40. GREEN_DIFF = 10
  41. # How much brighter the green channel must be (compared to the others),
  42. # to be called green.
  43. VALID_WORDS = {
  44. "constructor", "info", "anon", "send", "com", "root", "port", "val",
  45. "add", "ghost", "net", "http", "status", "syscall", "part", "delete",
  46. "datatype", "loadbytes", "setping", "size", "system", "setstats",
  47. "join", "socket", "signal", "dir", "accountname", "decryptfile",
  48. "intel", "xml", "connect", "sizeof", "writefile", "call", "reset",
  49. "global", "user", "remove", "count", "set", "loop", "num",
  50. "client", "file", "channel", "right", "stat", "emit", "handle",
  51. "buffer", "mysql", "write", "type", "list", "temp", "getfile",
  52. "thread", "decrypt", "poly", "setcookie", "domain", "length",
  53. "gridwidth", "upload", "get", "generatecodepack", "data",
  54. "process", "download", "proxy", "fillgrid", "bit", "encryptfile",
  55. "host", "ping", "event", "url", "load", "key", "changepassword",
  56. "bufferpingset", "getfirewallchannel", "getinfo", "getping", "pass",
  57. "newserver", "username", "generate", "userport", "init", "net",
  58. "left", "point", "cookies", "protocol", "responder", "getkey",
  59. "hostserver", "eventtype", "gridheight", "server", "setport",
  60. "getpass", "loadloggedpassword", "destroybatch", "getxmlprotocol",
  61. "channelsetpackage", "batchallfiles", "module", "response",
  62. "serverproxy", "filetype", "urlcheck", "config", "number",
  63. "ghostfilesystem", "disconnectserver", "emitconfiglist",
  64. "dodecahedron", "eventlistdir", "systemportkey", "setnewproxy",
  65. "createnewsocket", "changeusername", "tempdatapass", "blockthreat",
  66. "statusofprocess", "patcheventlog", "newline", "dir", "bytes",
  67. "findpackage", "package", "encode", "joinnetworkclient",
  68. "rootcookieset", "callmodule", "sizeofhexagon", "createfilethread",
  69. "includedirectory", "loadregisterlist", "encryptunpackedbatch",
  70. "getpartoffile", "getdatapassword", "create2axisvector",
  71. "create3axisvector", "disconnectchannel", "setnewid", "hexagon",
  72. "account", "removenewcookie", "getid", "encodenewfolder",
  73. "sendintelpass", "getlog", "command", "threat", "userid",
  74. "wordcounter", "removeoldcookie", "hostnewserver", "disconnect",
  75. "listconfig", "newhost", "createnewpackage", "loadaltevent", "log",
  76. "filedir", "fileexpresslog", "decryptdatabatch", "mergesocket",
  77. "unpacktmpfile", "uploaduserstats", "getmysqldomain",
  78. "checkhttptype", "encrypt", "vector", "httpbuffersize",
  79. "systemgridtype", "password", "respondertimeout", "deleteallids",
  80. "exportconfigpackage", "export"
  81. }
  82. # Check the environment, do we have all that we need?
  83. if not os.path.exists("images"):
  84. os.mkdir("images")
  85. if not os.path.exists("data"):
  86. os.mkdir("data")
  87. if not os.path.exists("words.txt"):
  88. with open('words.txt', 'w') as f: # Create a empty file
  89. f.write('')
  90. def image_filename(difficulty, index):
  91. return f"images/{difficulty}_{index}.png"
  92. def cleaned_filename(difficulty, index):
  93. return f"images/{difficulty}_{index}_clean.png"
  94. def cleaner_filename(difficulty, index):
  95. return f"images/{difficulty}_{index}_cleaner.png"
  96. def download(howhard, index):
  97. global sess
  98. """
  99. Download an image based upon how hard it is.
  100. On success, it saves the image file.
  101. Failure raises ConnectionError.
  102. Don't leave stale cleaned images around.
  103. """
  104. r = sess.get(f"http://s0urce.io/client/img/word/{howhard}/{index}")
  105. if r.status_code == 200:
  106. # DRY
  107. with open( image_filename(howhard, index), "wb") as f:
  108. f.write(r.content)
  109. # cleaned images? we need to delete & regenerate those.
  110. cleaned = cleaned_filename(howhard, index)
  111. if os.path.exists(cleaned):
  112. os.remove(cleaned)
  113. else:
  114. # We did not get a 200 Okay, log this... Hmm maybe we need to make a log file?
  115. # print( f'{howhard}_{index}.png ' + str(r.status_code) )
  116. raise ConnectionError(
  117. "http://s0urce.io/client/img/word/{0}/{1} returned status_code {2}".format(
  118. howhard, index, r.status_code
  119. )
  120. )
  121. def run(difficult, index):
  122. """
  123. run, represents a single execution of components to the image, (Actuall we do it 1 category at a time instead of just 1 single execution )
  124. those components do the following... (Each category has around 70 items so we standardize on 70, but )
  125. (not all of the categories have 70 and thus we print a File does not exist)
  126. We open and load the image, and get it's size,
  127. then we scan_img for dark and light pixels, <-- This narrows the image down to just the majority of dark pixels
  128. then from that we output the image line by line onto the screen after it has been output_image d into list form,
  129. Where we ask the user what the word is, and after that we save all that to a file in the data directory.
  130. """
  131. for x in range(0, 70):
  132. fname = image_filename(difficult, x)
  133. if not os.path.exists(fname):
  134. # print("Could not find '{0}'".format(fname))
  135. # continue
  136. # We've reached the end, so stop looking. :P
  137. break
  138. print(f"Loading: {fname}")
  139. im = Image.open(fname)
  140. pix = im.load()
  141. size = im.size
  142. print(f"Size: {size[0]} x {size[1]}")
  143. pal = im.getpalette()
  144. sx = 0
  145. ex = size[0]
  146. sy = 0
  147. ey = size[1]
  148. total = 0
  149. sx, sy, ex, ey, total = scan_img(pix, size)
  150. print(f"Chars within ({sx}, {sy}) - ({ex}, {ey}) total {total} pixels")
  151. img_s = output_image(pix, size)
  152. for l in img_s:
  153. print(l)
  154. word = input("Word: ")
  155. # Returns word so it can be stored in dictonary
  156. return word
  157. #print(f"Image saved to '{DIR}/{difficult}_{x}.txt' in byte string")
  158. # os.remove(f'{fname}') # Grr No bad bean, keep file for error checking
  159. # print(f"File '{fname}' automatically removed")
  160. key_word = {}
  161. misery = {}
  162. def quicktrain(difficult):
  163. """
  164. Quickly convert the images to text based upon filesize and image size.
  165. """
  166. quick = {
  167. "100_24_7874": "client",
  168. "100_24_7897": "status",
  169. "100_24_7929": "vector",
  170. "100_24_7937": "encode",
  171. "101_24_7761": "getkey",
  172. "101_24_7906": "server",
  173. "101_24_7922": "module",
  174. "101_24_7968": "socket",
  175. "101_24_7984": "config",
  176. "102_24_7983": "export",
  177. "102_24_7987": "number",
  178. "102_24_7997": "buffer",
  179. "102_24_8000": "getlog",
  180. "102_24_8010": "length",
  181. "102_24_8019": "global",
  182. "102_24_8075": "delete",
  183. "102_24_8087": "domain",
  184. "103_24_8085": "remove",
  185. "103_24_8099": "upload",
  186. "103_24_8102": "sizeof",
  187. "103_24_8122": "system",
  188. "103_24_8139": "threat",
  189. "103_24_8159": "userid",
  190. "104_24_8146": "thread",
  191. "114_24_9012": "getpass",
  192. "115_24_8980": "filedir",
  193. "115_24_9063": "account",
  194. "115_24_9113": "cookies",
  195. "116_24_9036": "newline",
  196. "116_24_9072": "getfile",
  197. "116_24_9089": "newhost",
  198. "116_24_9090": "process",
  199. "116_24_9122": "channel",
  200. "116_24_9136": "connect",
  201. "117_24_9101": "setping",
  202. "117_24_9168": "encrypt",
  203. "117_24_9176": "decrypt",
  204. "117_24_9182": "setport",
  205. "117_24_9248": "package",
  206. "117_24_9297": "hexagon",
  207. "118_24_9280": "getinfo",
  208. "118_24_9310": "getping",
  209. "119_24_9300": "syscall",
  210. "119_24_9321": "command",
  211. "131_24_10113": "generate",
  212. "131_24_10275": "userport",
  213. "132_24_10329": "download",
  214. "132_24_10342": "datatype",
  215. "132_24_10356": "username",
  216. "132_24_10379": "filetype",
  217. "132_24_10395": "protocol",
  218. "132_24_10404": "urlcheck",
  219. "133_24_10292": "response",
  220. "134_24_10420": "setstats",
  221. "134_24_10531": "setnewid",
  222. "134_24_10557": "password",
  223. "136_24_10707": "fillgrid",
  224. "145_24_11396": "loadbytes",
  225. "146_24_11393": "writefile",
  226. "147_24_11595": "setcookie",
  227. "148_24_11374": "eventtype",
  228. "148_24_11610": "newserver",
  229. "148_24_11626": "responder",
  230. "149_24_11614": "gridwidth",
  231. "161_24_12648": "hostserver",
  232. "163_24_12693": "listconfig",
  233. "163_24_12750": "callmodule",
  234. "164_24_12800": "disconnect",
  235. "164_24_12835": "gridheight",
  236. "176_24_13510": "mergesocket",
  237. "177_24_13789": "wordcounter",
  238. "177_24_13819": "accountname",
  239. "177_24_13824": "encryptfile",
  240. "177_24_13855": "serverproxy",
  241. "177_24_13871": "decryptfile",
  242. "177_24_13873": "constructor",
  243. "178_24_13825": "findpackage",
  244. "179_24_13843": "blockthreat",
  245. "179_24_14001": "setnewproxy",
  246. "192_24_14660": "dodecahedron",
  247. "192_24_15084": "destroybatch",
  248. "193_24_14925": "tempdatapass",
  249. "193_24_14979": "eventlistdir",
  250. "194_24_15129": "deleteallids",
  251. "195_24_15252": "loadaltevent",
  252. "207_24_16020": "batchallfiles",
  253. "207_24_16074": "sendintelpass",
  254. "208_24_16209": "getpartoffile",
  255. "208_24_16274": "unpacktmpfile",
  256. "208_24_16287": "hostnewserver",
  257. "208_24_16317": "systemportkey",
  258. "209_24_15651": "rootcookieset",
  259. "209_24_16185": "bufferpingset",
  260. "209_24_16243": "sizeofhexagon",
  261. "209_24_16338": "patcheventlog",
  262. "209_24_16357": "checkhttptype",
  263. "223_24_17412": "changeusername",
  264. "223_24_17418": "systemgridtype",
  265. "224_24_17265": "fileexpresslog",
  266. "224_24_17287": "getmysqldomain",
  267. "224_24_17484": "getxmlprotocol",
  268. "224_24_17487": "httpbuffersize",
  269. "224_24_17506": "emitconfiglist",
  270. "225_24_17005": "changepassword",
  271. "237_24_18534": "uploaduserstats",
  272. "238_24_18712": "encodenewfolder",
  273. "239_24_18670": "ghostfilesystem",
  274. "239_24_18700": "getdatapassword",
  275. "239_24_18710": "statusofprocess",
  276. "239_24_18713": "removeoldcookie",
  277. "239_24_18744": "removenewcookie",
  278. "241_24_18811": "createnewsocket",
  279. "253_24_19681": "generatecodepack",
  280. "254_24_19585": "createnewpackage",
  281. "254_24_19791": "disconnectserver",
  282. "254_24_19901": "decryptdatabatch",
  283. "255_24_19874": "includedirectory",
  284. "255_24_19938": "loadregisterlist",
  285. "256_24_19399": "createfilethread",
  286. "256_24_19577": "respondertimeout",
  287. "268_24_20945": "channelsetpackage",
  288. "268_24_20953": "disconnectchannel",
  289. "269_24_20857": "create2axisvector",
  290. "270_24_21037": "create3axisvector",
  291. "271_24_21016": "joinnetworkclient",
  292. "285_24_22237": "getfirewallchannel",
  293. "288_24_22241": "loadloggedpassword",
  294. "300_24_23305": "exportconfigpackage",
  295. "314_24_24079": "encryptunpackedbatch",
  296. "52_24_4127": "xml",
  297. "52_24_4170": "val",
  298. "53_24_4188": "url",
  299. "54_24_4316": "net",
  300. "54_24_4350": "key",
  301. "55_24_4381": "log",
  302. "55_24_4393": "set",
  303. "55_24_4417": "dir",
  304. "56_24_4487": "get",
  305. "57_24_4504": "num",
  306. "57_24_4524": "com",
  307. "57_24_4553": "bit",
  308. "57_24_4593": "add",
  309. "60_24_4770": "add",
  310. "68_24_5431": "http",
  311. "69_24_5525": "pass",
  312. "70_24_5392": "temp",
  313. "70_24_5444": "type",
  314. "70_24_5469": "list",
  315. "70_24_5529": "file",
  316. "70_24_5545": "loop",
  317. "70_24_5579": "ping",
  318. "70_24_5591": "port",
  319. "71_24_5544": "left",
  320. "71_24_5572": "size",
  321. "71_24_5610": "call",
  322. "71_24_5612": "root",
  323. "71_24_5632": "part",
  324. "71_24_5644": "init",
  325. "71_24_5649": "host",
  326. "71_24_5656": "poly",
  327. "71_24_5667": "info",
  328. "71_24_5748": "user",
  329. "72_24_5559": "join",
  330. "72_24_5699": "",
  331. "72_24_5753": "data",
  332. "73_24_5678": "stat",
  333. "74_24_5861": "send",
  334. "75_24_6004": "load",
  335. "83_24_6579": "proxy",
  336. "84_24_6417": "event",
  337. "85_24_6661": "intel",
  338. "85_24_6693": "right",
  339. "85_24_6735": "bytes",
  340. "86_24_6678": "mysql",
  341. "86_24_6762": "write",
  342. "87_24_6800": "ghost",
  343. "87_24_6911": "count",
  344. "88_24_6908": "reset",
  345. "88_24_6946": "point",
  346. "88_24_7003": "getid",
  347. "99_24_7741": "signal",
  348. "99_24_7796": "handle"
  349. }
  350. for x in range(0, 70):
  351. fname = image_filename(difficult, x)
  352. if not os.path.exists(fname):
  353. break
  354. # Output the image
  355. im = Image.open(fname)
  356. # imager.output_image(im)
  357. size = im.size
  358. filesize = os.path.getsize(fname)
  359. k = "{0}_{1}_{2}".format(*size, os.path.getsize(fname))
  360. if k in quick:
  361. correct = quick[k]
  362. if(correct == ''):
  363. imager.output_image(im)
  364. correct = input("Word: ")
  365. else:
  366. imager.output_image(im)
  367. print("well, shit.")
  368. sys.exit(5)
  369. print(fname, correct)
  370. key_word[f'{difficult}_{x}'] = correct
  371. # misery[correct] = { 'img_size': "{0},{1}".format(*size), 'filesize': os.path.getsize(fname) }
  372. misery[correct] = "{0}_{1}_{2}".format(*size, os.path.getsize(fname))
  373. def autotrain(difficult):
  374. """
  375. run, represents a single execution of components to the image, (Actuall we do it 1 category at a time instead of just 1 single execution )
  376. those components do the following... (Each category has around 70 items so we standardize on 70, but )
  377. (not all of the categories have 70 and thus we print a File does not exist)
  378. We open and load the image, and get it's size,
  379. then we scan_img for dark and light pixels, <-- This narrows the image down to just the majority of dark pixels
  380. then from that we output the image line by line onto the screen after it has been output_image d into list form,
  381. Where we ask the user what the word is, and after that we save all that to a file in the data directory.
  382. """
  383. for x in range(0, 70):
  384. fname = image_filename(difficult, x)
  385. if not os.path.exists(fname):
  386. break
  387. # print("Could not find '{0}'".format(fname))
  388. # continue
  389. cleaned = cleaned_filename(difficult, x)
  390. if not os.path.exists(cleaned):
  391. imager.image_cleaner(fname, cleaned)
  392. print(f"Loading: {cleaned} ", end='')
  393. fileout = "data/{0}_{1}".format(difficult, x)
  394. output = subprocess.run(
  395. ["tesseract", cleaned, fileout],
  396. stderr=subprocess.DEVNULL,
  397. # capture_output=False,
  398. shell=False,
  399. )
  400. with open(fileout + ".txt", "r") as fp:
  401. word = fp.read().strip().lower()
  402. if (word != '') and (word in VALID_WORDS):
  403. key_word[f'{difficult}_{x}'] = word
  404. im = Image.open(fname)
  405. # imager.output_image(im)
  406. size = im.size
  407. with open('words.txt', 'a') as f:
  408. f.write(f'{difficult}_{x} = {size[0]} x {size[1]} is {word}\n')
  409. print(word)
  410. if word in misery:
  411. print("Awwww SHIT! ", word)
  412. misery["{0}_{1}_{2}".format(*size, os.path.getsize(fname))] = word
  413. # misery[word] = { 'img_size': "{0},{1}".format(*size), 'filesize': os.path.getsize(fname) }
  414. else:
  415. print("UNKNOWN", word)
  416. # Output the image
  417. im = Image.open(fname)
  418. imager.output_image(im)
  419. size = im.size
  420. # img_s = output_image(pix, size)
  421. # for l in img_s:
  422. # print(l)
  423. correct = input("Word: ")
  424. key_word[f'{difficult}_{x}'] = correct
  425. # misery[correct] = { 'img_size': "{0},{1}".format(*size), 'filesize': os.path.getsize(fname) }
  426. if correct in misery:
  427. print("Awwww SHIT! ", correct)
  428. misery["{0}_{1}_{2}".format(*size, os.path.getsize(fname))] = correct
  429. with open('words.txt', 'a') as f:
  430. f.write(f'{difficult}_{x} = {size[0]} x {size[1]} is {correct}\n')
  431. with open('misery.json', 'w') as fp:
  432. json.dump(misery, fp, indent=2, sort_keys=True)
  433. # Now to call all the previous functions
  434. if args.download:
  435. print("Downloading s0urce.io Words")
  436. # smaller is better, and cleaner.
  437. tofetch = { 'e': 62, 'm': 66, 'h': 55 }
  438. for d, max in tofetch.items():
  439. print(d.upper())
  440. for i in range(0, max):
  441. download(d, i)
  442. # time.sleep(random.randint(10, 15))
  443. if args.train:
  444. # Img Processing: Run thru every single category and every single word
  445. for level in ["e", "m", "h"]:
  446. autotrain(level)
  447. with open(args.JSON, 'w') as fp:
  448. json.dump(key_word, fp, sort_keys=True, indent=2)
  449. if args.quick:
  450. # Img Processing: Run thru every single category and every single word
  451. for level in ["e", "m", "h"]:
  452. quicktrain(level)
  453. with open(args.JSON, 'w') as fp:
  454. json.dump(key_word, fp, sort_keys=True, indent=2)
  455. if args.update:
  456. with open(args.JSON, 'r') as fp:
  457. key_word = json.load(fp)
  458. # update the s0urce.js script
  459. filename = 's0urce.user.js'
  460. with open(filename, 'r') as fp:
  461. lines = fp.readlines()
  462. # Lines are now in memory. Time to update!
  463. for i in range(0, len(lines)):
  464. if 'http://s0urce.io/client/img/word/' in lines[i]:
  465. # This is a target line, so:
  466. l = lines[i].strip().strip(':').strip('"')
  467. # gets parts of the path
  468. parts = l.split('/')
  469. # get difficulty and index
  470. dif = parts[-2]
  471. index = parts[-1]
  472. # build the key -- get the word
  473. key = f'{dif}_{index}'
  474. # pprint(parts)
  475. # pprint(key)
  476. word = key_word[key]
  477. print("{0} : {1}".format( key, word))
  478. lines[i+1] = f' form.value = "{word}";' + "\n" # break;\n" # You may need it... or may not.
  479. if '@version' in lines[i]:
  480. # Get version number
  481. l = lines[i]
  482. lis = list(l)
  483. # Get values and increment decimal
  484. who = int(lis[-4]) # Whole
  485. dec = int(lis[-2]) # Decimal
  486. dec += 1
  487. if (dec >= 10):
  488. who += 1
  489. dec = 0
  490. # Update the list so it displays the new values
  491. lis[-4] = str(who)
  492. lis[-2] = str(dec)
  493. # Write that all out again
  494. rest = ''
  495. for e in lis:
  496. rest += e
  497. lines[i] = rest
  498. with open(filename, 'w') as fp:
  499. for line in lines:
  500. fp.write(line)
  501. output = subprocess.run(
  502. ["scp", filename, "linode:/usr/share/nginx/bugz"],
  503. stderr=subprocess.DEVNULL,
  504. # capture_output=False,
  505. shell=False,
  506. )
  507. print("Please visit https://bugz.red-green.com/s0urce.user.js !")
  508. # Regardless what we did let the user know we at least ran and we are now done
  509. print("Complete")