/* <!-- copyright */
/*
 * aria2 - The high speed download utility
 *
 * Copyright (C) 2012 Tatsuhiro Tsujikawa
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * In addition, as a special exception, the copyright holders give
 * permission to link the code of portions of this program with the
 * OpenSSL library under certain conditions as described in each
 * individual source file, and distribute linked combinations
 * including the two.
 * You must obey the GNU General Public License in all respects
 * for all of the code used other than OpenSSL.  If you modify
 * file(s) with this exception, you may extend this exception to your
 * version of the file(s), but you are not obligated to do so.  If you
 * do not wish to do so, delete this exception statement from your
 * version.  If you delete this exception statement from all source
 * files in the program, then also delete it here.
 */
/* copyright --> */
#include "uri_split.h"

#include <stdint.h>
#include <stdlib.h>
  37. typedef enum {
  38. URI_BEFORE_SCHEME,
  39. URI_SCHEME,
  40. URI_SCHEME_SLASH1,
  41. URI_SCHEME_SLASH2,
  42. URI_BEFORE_MAYBE_USER,
  43. URI_MAYBE_USER,
  44. URI_BEFORE_MAYBE_PASSWD,
  45. URI_MAYBE_PASSWD,
  46. URI_BEFORE_HOST,
  47. URI_HOST,
  48. URI_BEFORE_IPV6HOST,
  49. URI_IPV6HOST,
  50. URI_AFTER_IPV6HOST,
  51. URI_BEFORE_PORT,
  52. URI_PORT,
  53. URI_PATH,
  54. URI_BEFORE_QUERY,
  55. URI_QUERY,
  56. URI_BEFORE_FRAGMENT,
  57. URI_FRAGMENT
  58. } uri_split_state;
  59. static void uri_set_field(uri_split_result* res, int field, const char* first,
  60. const char* last, const char* uri)
  61. {
  62. if (first) {
  63. res->field_set |= 1 << field;
  64. res->fields[field].off = first - uri;
  65. res->fields[field].len = last - first;
  66. }
  67. }
  68. static int is_digit(char c) { return '0' <= c && c <= '9'; }
  69. int uri_split(uri_split_result* res, const char* uri)
  70. {
  71. int state = URI_BEFORE_SCHEME;
  72. const char *scheme_first = NULL, *scheme_last = NULL, *host_first = NULL,
  73. *host_last = NULL, *path_first = NULL, *path_last = NULL,
  74. *query_first = NULL, *query_last = NULL, *fragment_first = NULL,
  75. *fragment_last = NULL, *user_first = NULL, *user_last = NULL,
  76. *passwd_first = NULL, *passwd_last = NULL, *last_atmark = NULL,
  77. *last_slash = NULL, *p = uri;
  78. int32_t port = -1;
  79. uint8_t flags = 0;
  80. for (; *p; ++p) {
  81. switch (state) {
  82. case URI_BEFORE_SCHEME:
  83. scheme_first = p;
  84. state = URI_SCHEME;
  85. break;
  86. case URI_SCHEME:
  87. if (*p == ':') {
  88. scheme_last = p;
  89. state = URI_SCHEME_SLASH1;
  90. }
  91. break;
  92. case URI_SCHEME_SLASH1:
  93. if (*p == '/') {
  94. state = URI_SCHEME_SLASH2;
  95. }
  96. else {
  97. return -1;
  98. }
  99. break;
  100. case URI_SCHEME_SLASH2:
  101. if (*p == '/') {
  102. state = URI_BEFORE_MAYBE_USER;
  103. }
  104. else {
  105. return -1;
  106. }
  107. break;
  108. case URI_BEFORE_MAYBE_USER:
  109. switch (*p) {
  110. case '@':
  111. case ':':
  112. case '/':
  113. return -1;
  114. case '[':
  115. state = URI_BEFORE_IPV6HOST;
  116. break;
  117. default:
  118. user_first = p;
  119. state = URI_MAYBE_USER;
  120. }
  121. break;
  122. case URI_MAYBE_USER:
  123. switch (*p) {
  124. case '@':
  125. last_atmark = p;
  126. break;
  127. case ':':
  128. user_last = p;
  129. state = URI_BEFORE_MAYBE_PASSWD;
  130. break;
  131. case '[':
  132. if (last_atmark == p - 1) {
  133. user_last = last_atmark;
  134. state = URI_BEFORE_IPV6HOST;
  135. }
  136. else {
  137. return -1;
  138. }
  139. break;
  140. case '/':
  141. case '?':
  142. case '#':
  143. /* It turns out that this is only host or user + host if
  144. last_atmark is not NULL. */
  145. if (last_atmark) {
  146. host_first = last_atmark + 1;
  147. host_last = p;
  148. user_last = last_atmark;
  149. }
  150. else {
  151. host_first = user_first;
  152. host_last = p;
  153. user_first = user_last = NULL;
  154. }
  155. switch (*p) {
  156. case '/':
  157. path_first = last_slash = p;
  158. state = URI_PATH;
  159. break;
  160. case '?':
  161. state = URI_BEFORE_QUERY;
  162. break;
  163. case '#':
  164. state = URI_BEFORE_FRAGMENT;
  165. break;
  166. }
  167. break;
  168. }
  169. break;
  170. case URI_BEFORE_MAYBE_PASSWD:
  171. passwd_first = p;
  172. switch (*p) {
  173. case '@':
  174. passwd_last = last_atmark = p;
  175. state = URI_BEFORE_HOST;
  176. break;
  177. case '/':
  178. return -1;
  179. default:
  180. /* sums up port number in case of port. */
  181. if (is_digit(*p)) {
  182. port = *p - '0';
  183. }
  184. state = URI_MAYBE_PASSWD;
  185. }
  186. break;
  187. case URI_MAYBE_PASSWD:
  188. switch (*p) {
  189. case '@':
  190. passwd_last = last_atmark = p;
  191. /* Passwd confirmed, reset port to -1. */
  192. port = -1;
  193. state = URI_BEFORE_HOST;
  194. break;
  195. case '[':
  196. return -1;
  197. case '/':
  198. case '?':
  199. case '#':
  200. /* This is port not password. port is in [passwd_first, p) */
  201. if (port == -1) {
  202. return -1;
  203. }
  204. if (last_atmark) {
  205. host_first = last_atmark + 1;
  206. host_last = passwd_first - 1;
  207. user_last = last_atmark;
  208. }
  209. else {
  210. host_first = user_first;
  211. host_last = passwd_first - 1;
  212. user_first = user_last = NULL;
  213. }
  214. passwd_first = passwd_last = NULL;
  215. switch (*p) {
  216. case '/':
  217. path_first = last_slash = p;
  218. state = URI_PATH;
  219. break;
  220. case '?':
  221. state = URI_BEFORE_QUERY;
  222. break;
  223. case '#':
  224. state = URI_BEFORE_FRAGMENT;
  225. break;
  226. }
  227. break;
  228. default:
  229. if (port != -1) {
  230. if (is_digit(*p)) {
  231. port *= 10;
  232. port += *p - '0';
  233. if (port > UINT16_MAX) {
  234. port = -1;
  235. }
  236. }
  237. else {
  238. port = -1;
  239. }
  240. }
  241. break;
  242. }
  243. break;
  244. case URI_BEFORE_HOST:
  245. switch (*p) {
  246. case ':':
  247. case '/':
  248. return -1;
  249. case '[':
  250. state = URI_BEFORE_IPV6HOST;
  251. break;
  252. default:
  253. host_first = p;
  254. state = URI_HOST;
  255. break;
  256. }
  257. break;
  258. case URI_HOST:
  259. switch (*p) {
  260. case ':':
  261. host_last = p;
  262. state = URI_BEFORE_PORT;
  263. break;
  264. case '/':
  265. host_last = path_first = last_slash = p;
  266. state = URI_PATH;
  267. break;
  268. case '?':
  269. host_last = p;
  270. state = URI_BEFORE_QUERY;
  271. break;
  272. case '#':
  273. host_last = p;
  274. state = URI_BEFORE_FRAGMENT;
  275. break;
  276. }
  277. break;
  278. case URI_BEFORE_IPV6HOST:
  279. if (*p == ']') {
  280. return -1;
  281. }
  282. host_first = p;
  283. state = URI_IPV6HOST;
  284. break;
  285. case URI_IPV6HOST:
  286. if (*p == ']') {
  287. flags |= USF_IPV6ADDR;
  288. host_last = p;
  289. state = URI_AFTER_IPV6HOST;
  290. }
  291. break;
  292. case URI_AFTER_IPV6HOST:
  293. switch (*p) {
  294. case ':':
  295. state = URI_BEFORE_PORT;
  296. break;
  297. case '/':
  298. path_first = last_slash = p;
  299. state = URI_PATH;
  300. break;
  301. case '?':
  302. state = URI_BEFORE_QUERY;
  303. break;
  304. case '#':
  305. state = URI_BEFORE_FRAGMENT;
  306. break;
  307. default:
  308. return -1;
  309. }
  310. break;
  311. case URI_BEFORE_PORT:
  312. if (is_digit(*p)) {
  313. port = *p - '0';
  314. state = URI_PORT;
  315. }
  316. else {
  317. return -1;
  318. }
  319. break;
  320. case URI_PORT:
  321. switch (*p) {
  322. case '/':
  323. path_first = last_slash = p;
  324. state = URI_PATH;
  325. break;
  326. case '?':
  327. state = URI_BEFORE_QUERY;
  328. break;
  329. case '#':
  330. state = URI_BEFORE_FRAGMENT;
  331. break;
  332. default:
  333. if (is_digit(*p)) {
  334. port *= 10;
  335. port += *p - '0';
  336. if (port > UINT16_MAX) {
  337. return -1;
  338. }
  339. }
  340. else {
  341. return -1;
  342. }
  343. }
  344. break;
  345. case URI_PATH:
  346. switch (*p) {
  347. case '/':
  348. last_slash = p;
  349. break;
  350. case '?':
  351. path_last = p;
  352. state = URI_BEFORE_QUERY;
  353. break;
  354. case '#':
  355. path_last = p;
  356. state = URI_BEFORE_FRAGMENT;
  357. break;
  358. }
  359. break;
  360. case URI_BEFORE_QUERY:
  361. query_first = p;
  362. if (*p == '#') {
  363. query_last = p;
  364. state = URI_BEFORE_FRAGMENT;
  365. }
  366. else {
  367. state = URI_QUERY;
  368. }
  369. break;
  370. case URI_QUERY:
  371. if (*p == '#') {
  372. query_last = p;
  373. state = URI_BEFORE_FRAGMENT;
  374. }
  375. break;
  376. case URI_BEFORE_FRAGMENT:
  377. fragment_first = p;
  378. state = URI_FRAGMENT;
  379. break;
  380. case URI_FRAGMENT:
  381. break;
  382. }
  383. }
  384. /* Handle premature states */
  385. switch (state) {
  386. case URI_BEFORE_SCHEME:
  387. case URI_SCHEME:
  388. case URI_SCHEME_SLASH1:
  389. case URI_SCHEME_SLASH2:
  390. return -1;
  391. case URI_BEFORE_MAYBE_USER:
  392. return -1;
  393. case URI_MAYBE_USER:
  394. if (last_atmark) {
  395. host_first = last_atmark + 1;
  396. host_last = p;
  397. if (host_first == host_last) {
  398. return -1;
  399. }
  400. user_last = last_atmark;
  401. }
  402. else {
  403. host_first = user_first;
  404. host_last = p;
  405. user_first = user_last = NULL;
  406. }
  407. break;
  408. case URI_BEFORE_MAYBE_PASSWD:
  409. return -1;
  410. case URI_MAYBE_PASSWD:
  411. if (port == -1) {
  412. return -1;
  413. }
  414. if (last_atmark) {
  415. host_first = last_atmark + 1;
  416. host_last = passwd_first - 1;
  417. user_last = last_atmark;
  418. }
  419. else {
  420. host_first = user_first;
  421. host_last = passwd_first - 1;
  422. user_first = user_last = NULL;
  423. }
  424. passwd_first = passwd_last = NULL;
  425. break;
  426. case URI_BEFORE_HOST:
  427. return -1;
  428. case URI_HOST:
  429. host_last = p;
  430. break;
  431. case URI_BEFORE_IPV6HOST:
  432. case URI_IPV6HOST:
  433. return -1;
  434. case URI_AFTER_IPV6HOST:
  435. break;
  436. case URI_BEFORE_PORT:
  437. return -1;
  438. case URI_PORT:
  439. if (port == -1) {
  440. return -1;
  441. }
  442. break;
  443. case URI_PATH:
  444. path_last = p;
  445. break;
  446. case URI_BEFORE_QUERY:
  447. query_first = query_last = p;
  448. break;
  449. case URI_QUERY:
  450. query_last = p;
  451. break;
  452. case URI_BEFORE_FRAGMENT:
  453. fragment_first = fragment_last = p;
  454. break;
  455. case URI_FRAGMENT:
  456. fragment_last = p;
  457. break;
  458. default:
  459. return -1;
  460. };
  461. if (res) {
  462. res->field_set = 0;
  463. res->port = 0;
  464. res->flags = flags;
  465. uri_set_field(res, USR_SCHEME, scheme_first, scheme_last, uri);
  466. uri_set_field(res, USR_HOST, host_first, host_last, uri);
  467. uri_set_field(res, USR_PATH, path_first, path_last, uri);
  468. uri_set_field(res, USR_QUERY, query_first, query_last, uri);
  469. uri_set_field(res, USR_FRAGMENT, fragment_first, fragment_last, uri);
  470. uri_set_field(res, USR_USER, user_first, user_last, uri);
  471. uri_set_field(res, USR_PASSWD, passwd_first, passwd_last, uri);
  472. if (res->field_set & (1 << USR_USER)) {
  473. uri_set_field(res, USR_USERINFO, user_first, last_atmark, uri);
  474. }
  475. if (last_slash && last_slash + 1 != path_last) {
  476. uri_set_field(res, USR_BASENAME, last_slash + 1, path_last, uri);
  477. }
  478. if (port != -1) {
  479. res->field_set |= 1 << USR_PORT;
  480. res->port = port;
  481. }
  482. }
  483. return 0;
  484. }