xref: /haiku/src/add-ons/kernel/network/stack/net_socket.cpp (revision c7509fce9db782326f159843f1b028b5f5dcb1d2)
1 /*
2  * Copyright 2006-2007, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Axel Dörfler, axeld@pinc-software.de
7  */
8 
9 
10 #include "stack_private.h"
11 
12 #include <net_protocol.h>
13 #include <net_stack.h>
14 #include <net_stat.h>
15 
16 #include <KernelExport.h>
17 #include <team.h>
18 #include <util/AutoLock.h>
19 #include <util/list.h>
20 #include <fs/select_sync_pool.h>
21 
22 #include <new>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/time.h>
26 
27 
/*!
	Stack-private extension of net_socket. It adds the bookkeeping that the
	functions in this file need on top of the public socket fields.
*/
struct net_socket_private : net_socket {
	struct list_link		link;
		// list linkage; sSocketList and the pending_children and
		// connected_children lists are all initialized with this offset
	team_id					owner;
		// set to the opening team in socket_open(), copied from the parent
		// in socket_spawn_pending()
	uint32					max_backlog;
		// set by socket_set_max_backlog(), which caps it at 256
	uint32					child_count;
		// incremented when a child is spawned, decremented when a child is
		// dequeued or dropped by socket_set_max_backlog()
	struct list				pending_children;
		// children created by socket_spawn_pending()
	struct list				connected_children;
		// children moved over by socket_connected()

	struct select_sync_pool	*select_pool;
		// NULL until add_select_sync_pool_entry() is called for the socket
	benaphore				lock;
		// taken around accesses to the child lists and the select pool
};
39 
40 
// forward declarations; both functions are called before their definition
void socket_delete(net_socket *socket);
int socket_bind(net_socket *socket, const struct sockaddr *address, socklen_t addressLength);

// List of the sockets added by socket_open() and socket_dequeue_connected();
// sSocketLock is held around every access to the list in this file.
struct list sSocketList;
benaphore sSocketLock;
46 
47 
48 static status_t
49 create_socket(int family, int type, int protocol, net_socket_private **_socket)
50 {
51 	struct net_socket_private *socket = new (std::nothrow) net_socket_private;
52 	if (socket == NULL)
53 		return B_NO_MEMORY;
54 
55 	memset(socket, 0, sizeof(net_socket_private));
56 	socket->family = family;
57 	socket->type = type;
58 	socket->protocol = protocol;
59 
60 	status_t status = benaphore_init(&socket->lock, "socket");
61 	if (status < B_OK)
62 		goto err1;
63 
64 	// set defaults (may be overridden by the protocols)
65 	socket->send.buffer_size = 65535;
66 	socket->send.low_water_mark = 1;
67 	socket->send.timeout = B_INFINITE_TIMEOUT;
68 	socket->receive.buffer_size = 65535;
69 	socket->receive.low_water_mark = 1;
70 	socket->receive.timeout = B_INFINITE_TIMEOUT;
71 
72 	list_init_etc(&socket->pending_children, offsetof(net_socket_private, link));
73 	list_init_etc(&socket->connected_children, offsetof(net_socket_private, link));
74 
75 	status = get_domain_protocols(socket);
76 	if (status < B_OK)
77 		goto err2;
78 
79 	*_socket = socket;
80 	return B_OK;
81 
82 err2:
83 	benaphore_destroy(&socket->lock);
84 err1:
85 	delete socket;
86 	return status;
87 }
88 
89 
90 //	#pragma mark -
91 
92 
93 status_t
94 socket_open(int family, int type, int protocol, net_socket **_socket)
95 {
96 	net_socket_private *socket;
97 	status_t status = create_socket(family, type, protocol, &socket);
98 	if (status < B_OK)
99 		return status;
100 
101 	status = socket->first_info->open(socket->first_protocol);
102 	if (status < B_OK) {
103 		socket_delete(socket);
104 		return status;
105 	}
106 
107 	socket->owner = team_get_current_team_id();
108 
109 	benaphore_lock(&sSocketLock);
110 	list_add_item(&sSocketList, socket);
111 	benaphore_unlock(&sSocketLock);
112 
113 	*_socket = socket;
114 	return B_OK;
115 }
116 
117 
118 status_t
119 socket_close(net_socket *_socket)
120 {
121 	net_socket_private *socket = (net_socket_private *)_socket;
122 
123 	if (socket->select_pool) {
124 		// notify all pending selects
125 		notify_select_event_pool(socket->select_pool, ~0);
126 	}
127 
128 	return socket->first_info->close(socket->first_protocol);
129 }
130 
131 
132 status_t
133 socket_free(net_socket *socket)
134 {
135 	status_t status = socket->first_info->free(socket->first_protocol);
136 	if (status == B_BUSY)
137 		return B_OK;
138 
139 	socket_delete(socket);
140 	return B_OK;
141 }
142 
143 
144 status_t
145 socket_readv(net_socket *socket, const iovec *vecs, size_t vecCount, size_t *_length)
146 {
147 	return -1;
148 }
149 
150 
151 status_t
152 socket_writev(net_socket *socket, const iovec *vecs, size_t vecCount, size_t *_length)
153 {
154 	if (socket->peer.ss_len == 0)
155 		return ECONNRESET;
156 
157 	if (socket->address.ss_len == 0) {
158 		// try to bind first
159 		status_t status = socket_bind(socket, NULL, 0);
160 		if (status < B_OK)
161 			return status;
162 	}
163 
164 	// TODO: useful, maybe even computed header space!
165 	net_buffer *buffer = gNetBufferModule.create(256);
166 	if (buffer == NULL)
167 		return ENOBUFS;
168 
169 	// copy data into buffer
170 
171 	for (uint32 i = 0; i < vecCount; i++) {
172 		if (gNetBufferModule.append(buffer, vecs[i].iov_base,
173 				vecs[i].iov_len) < B_OK) {
174 			gNetBufferModule.free(buffer);
175 			return ENOBUFS;
176 		}
177 	}
178 
179 	memcpy(&buffer->source, &socket->address, socket->address.ss_len);
180 	memcpy(&buffer->destination, &socket->peer, socket->peer.ss_len);
181 	size_t size = buffer->size;
182 
183 	ssize_t bytesWritten = socket->first_info->send_data(socket->first_protocol,
184 		buffer);
185 	if (bytesWritten < B_OK) {
186 		if (buffer->size != size) {
187 			// this appears to be a partial write
188 			*_length = size - buffer->size;
189 		}
190 		gNetBufferModule.free(buffer);
191 		return bytesWritten;
192 	}
193 
194 	*_length = bytesWritten;
195 	return B_OK;
196 }
197 
198 
199 status_t
200 socket_control(net_socket *socket, int32 op, void *data, size_t length)
201 {
202 	return socket->first_info->control(socket->first_protocol,
203 		LEVEL_DRIVER_IOCTL, op, data, &length);
204 }
205 
206 
207 ssize_t
208 socket_read_avail(net_socket *socket)
209 {
210 	return socket->first_info->read_avail(socket->first_protocol);
211 }
212 
213 
214 ssize_t
215 socket_send_avail(net_socket *socket)
216 {
217 	return socket->first_info->send_avail(socket->first_protocol);
218 }
219 
220 
221 status_t
222 socket_send_data(net_socket *socket, net_buffer *buffer)
223 {
224 	return socket->first_info->send_data(socket->first_protocol,
225 		buffer);
226 }
227 
228 
229 status_t
230 socket_receive_data(net_socket *socket, size_t length, uint32 flags,
231 	net_buffer **_buffer)
232 {
233 	status_t status = socket->first_info->read_data(socket->first_protocol,
234 		length, flags, _buffer);
235 
236 	if (status < B_OK)
237 		return status;
238 
239 	if (*_buffer && length < (*_buffer)->size) {
240 		// discard any data behind the amount requested
241 		gNetBufferModule.trim(*_buffer, length);
242 	}
243 
244 	return status;
245 }
246 
247 
248 status_t
249 socket_get_next_stat(uint32 *_cookie, int family, struct net_stat *stat)
250 {
251 	BenaphoreLocker locker(sSocketLock);
252 
253 	net_socket_private *socket = NULL;
254 	uint32 cookie = *_cookie;
255 	uint32 count = 0;
256 	while ((socket = (net_socket_private *)list_get_next_item(&sSocketList, socket)) != NULL) {
257 		// TODO: also traverse the pending connections
258 		if (count == cookie)
259 			break;
260 
261 		if (family == -1 || family == socket->family)
262 			count++;
263 	}
264 
265 	if (socket == NULL)
266 		return B_ENTRY_NOT_FOUND;
267 
268 	*_cookie = count + 1;
269 
270 	stat->family = socket->family;
271 	stat->type = socket->type;
272 	stat->protocol = socket->protocol;
273 	stat->owner = socket->owner;
274 	stat->state[0] = '\0';
275 	memcpy(&stat->address, &socket->address, sizeof(struct sockaddr_storage));
276 	memcpy(&stat->peer, &socket->peer, sizeof(struct sockaddr_storage));
277 
278 	// fill in protocol specific data (if supported by the protocol)
279 	size_t length = sizeof(net_stat);
280 	socket->first_info->control(socket->first_protocol, socket->protocol,
281 		NET_STAT_SOCKET, stat, &length);
282 
283 	return B_OK;
284 }
285 
286 
287 //	#pragma mark - connections
288 
289 
290 status_t
291 socket_spawn_pending(net_socket *_parent, net_socket **_socket)
292 {
293 	net_socket_private *parent = (net_socket_private *)_parent;
294 
295 	BenaphoreLocker locker(parent->lock);
296 
297 	// We actually accept more pending connections to compensate for those
298 	// that never complete, and also make sure at least a single connection
299 	// can always be accepted
300 	if (parent->child_count > 3 * parent->max_backlog / 2)
301 		return ENOBUFS;
302 
303 	net_socket_private *socket;
304 	status_t status = create_socket(parent->family, parent->type, parent->protocol,
305 		&socket);
306 	if (status < B_OK)
307 		return status;
308 
309 	// inherit parent's properties
310 	socket->send = parent->send;
311 	socket->receive = parent->receive;
312 	socket->options = parent->options & ~SO_ACCEPTCONN;
313 	socket->linger = parent->linger;
314 	socket->owner = parent->owner;
315 	memcpy(&socket->address, &parent->address, parent->address.ss_len);
316 	memcpy(&socket->peer, &parent->peer, parent->peer.ss_len);
317 
318 	// add to the parent's list of pending connections
319 	list_add_item(&parent->pending_children, socket);
320 	socket->parent = parent;
321 	parent->child_count++;
322 
323 	*_socket = socket;
324 	return B_OK;
325 }
326 
327 
328 void
329 socket_delete(net_socket *_socket)
330 {
331 	net_socket_private *socket = (net_socket_private *)_socket;
332 
333 	if (socket->parent != NULL)
334 		panic("socket still has a parent!");
335 
336 	benaphore_lock(&sSocketLock);
337 	list_remove_item(&sSocketList, socket);
338 	benaphore_unlock(&sSocketLock);
339 
340 	put_domain_protocols(socket);
341 	benaphore_destroy(&socket->lock);
342 	delete_select_sync_pool(socket->select_pool);
343 	delete socket;
344 }
345 
346 
347 status_t
348 socket_dequeue_connected(net_socket *_parent, net_socket **_socket)
349 {
350 	net_socket_private *parent = (net_socket_private *)_parent;
351 
352 	benaphore_lock(&parent->lock);
353 
354 	net_socket_private *socket = (net_socket_private *)list_remove_head_item(
355 		&parent->connected_children);
356 	if (socket != NULL) {
357 		socket->parent = NULL;
358 		parent->child_count--;
359 		*_socket = socket;
360 	}
361 
362 	benaphore_unlock(&parent->lock);
363 
364 	if (socket == NULL)
365 		return B_ENTRY_NOT_FOUND;
366 
367 	benaphore_lock(&sSocketLock);
368 	list_add_item(&sSocketList, socket);
369 	benaphore_unlock(&sSocketLock);
370 
371 	return B_OK;
372 }
373 
374 
375 ssize_t
376 socket_count_connected(net_socket *_parent)
377 {
378 	net_socket_private *parent = (net_socket_private *)_parent;
379 
380 	BenaphoreLocker _(parent->lock);
381 
382 	int count = 0;
383 	for (void *it = list_get_first_item(&parent->connected_children);
384 			it != NULL; it = list_get_next_item(&parent->connected_children, it))
385 		count++;
386 
387 	return count;
388 }
389 
390 
391 status_t
392 socket_set_max_backlog(net_socket *_socket, uint32 backlog)
393 {
394 	net_socket_private *socket = (net_socket_private *)_socket;
395 
396 	// we enforce an upper limit of connections waiting to be accepted
397 	if (backlog > 256)
398 		backlog = 256;
399 
400 	benaphore_lock(&socket->lock);
401 
402 	// first remove the pending connections, then the already connected ones as needed
403 	net_socket_private *child;
404 	while (socket->child_count > backlog
405 		&& (child = (net_socket_private *)list_remove_tail_item(&socket->pending_children)) != NULL) {
406 		child->parent = NULL;
407 		socket->child_count--;
408 	}
409 	while (socket->child_count > backlog
410 		&& (child = (net_socket_private *)list_remove_tail_item(&socket->connected_children)) != NULL) {
411 		child->parent = NULL;
412 		socket_delete(child);
413 		socket->child_count--;
414 	}
415 
416 	socket->max_backlog = backlog;
417 	benaphore_unlock(&socket->lock);
418 	return B_OK;
419 }
420 
421 
422 /*!
423 	The socket has been connected. It will be moved to the connected queue
424 	of its parent socket.
425 */
426 status_t
427 socket_connected(net_socket *socket)
428 {
429 	net_socket_private *parent = (net_socket_private *)socket->parent;
430 	if (parent == NULL)
431 		return B_BAD_VALUE;
432 
433 	benaphore_lock(&parent->lock);
434 
435 	list_remove_item(&parent->pending_children, socket);
436 	list_add_item(&parent->connected_children, socket);
437 
438 	// notify parent
439 	if (parent->select_pool)
440 		notify_select_event_pool(parent->select_pool, B_SELECT_READ);
441 
442 	benaphore_unlock(&parent->lock);
443 	return B_OK;
444 }
445 
446 
447 //	#pragma mark - notifications
448 
449 
450 status_t
451 socket_request_notification(net_socket *_socket, uint8 event, uint32 ref,
452 	selectsync *sync)
453 {
454 	net_socket_private *socket = (net_socket_private *)_socket;
455 
456 	benaphore_lock(&socket->lock);
457 
458 	status_t status = add_select_sync_pool_entry(&socket->select_pool, sync,
459 		ref, event);
460 
461 	benaphore_unlock(&socket->lock);
462 
463 	if (status < B_OK)
464 		return status;
465 
466 	// check if the event is already present
467 	// TODO: add support for poll() types
468 
469 	switch (event) {
470 		case B_SELECT_READ:
471 		{
472 			ssize_t available = socket_read_avail(socket);
473 			if ((ssize_t)socket->receive.low_water_mark <= available || available < B_OK)
474 				notify_select_event(sync, ref, event);
475 			break;
476 		}
477 		case B_SELECT_WRITE:
478 		{
479 			ssize_t available = socket_send_avail(socket);
480 			if ((ssize_t)socket->send.low_water_mark <= available || available < B_OK)
481 				notify_select_event(sync, ref, event);
482 			break;
483 		}
484 		case B_SELECT_ERROR:
485 			// TODO: B_SELECT_ERROR condition!
486 			break;
487 	}
488 
489 	return B_OK;
490 }
491 
492 
493 status_t
494 socket_cancel_notification(net_socket *_socket, uint8 event, selectsync *sync)
495 {
496 	net_socket_private *socket = (net_socket_private *)_socket;
497 
498 	benaphore_lock(&socket->lock);
499 
500 	status_t status = remove_select_sync_pool_entry(&socket->select_pool,
501 		sync, event);
502 
503 	benaphore_unlock(&socket->lock);
504 	return status;
505 }
506 
507 
508 status_t
509 socket_notify(net_socket *_socket, uint8 event, int32 value)
510 {
511 	net_socket_private *socket = (net_socket_private *)_socket;
512 	bool notify = true;
513 
514 	switch (event) {
515 		case B_SELECT_READ:
516 			if ((ssize_t)socket->receive.low_water_mark > value && value >= B_OK)
517 				notify = false;
518 			break;
519 
520 		case B_SELECT_WRITE:
521 			if ((ssize_t)socket->send.low_water_mark > value && value >= B_OK)
522 				notify = false;
523 			break;
524 
525 		case B_SELECT_ERROR:
526 			socket->error = value;
527 			break;
528 	}
529 
530 	benaphore_lock(&socket->lock);
531 
532 	if (notify && socket->select_pool)
533 		notify_select_event_pool(socket->select_pool, event);
534 
535 	benaphore_unlock(&socket->lock);
536 	return B_OK;
537 }
538 
539 
540 //	#pragma mark - standard socket API
541 
542 
543 int
544 socket_accept(net_socket *socket, struct sockaddr *address, socklen_t *_addressLength,
545 	net_socket **_acceptedSocket)
546 {
547 	if ((socket->options & SO_ACCEPTCONN) == 0)
548 		return B_BAD_VALUE;
549 
550 	net_socket *accepted;
551 	status_t status = socket->first_info->accept(socket->first_protocol,
552 		&accepted);
553 	if (status < B_OK)
554 		return status;
555 
556 	if (address && *_addressLength > 0) {
557 		memcpy(address, &accepted->peer, min_c(*_addressLength,
558 			min_c(accepted->peer.ss_len, sizeof(sockaddr_storage))));
559 		*_addressLength = accepted->peer.ss_len;
560 	}
561 
562 	*_acceptedSocket = accepted;
563 	return B_OK;
564 }
565 
566 
567 int
568 socket_bind(net_socket *socket, const struct sockaddr *address, socklen_t addressLength)
569 {
570 	sockaddr empty;
571 	if (address == NULL) {
572 		// special - try to bind to an empty address, like INADDR_ANY
573 		memset(&empty, 0, sizeof(sockaddr));
574 		empty.sa_len = sizeof(sockaddr);
575 		empty.sa_family = socket->family;
576 
577 		address = &empty;
578 		addressLength = sizeof(sockaddr);
579 	}
580 
581 	if (socket->address.ss_len != 0) {
582 		status_t status = socket->first_info->unbind(socket->first_protocol,
583 			(sockaddr *)&socket->address);
584 		if (status < B_OK)
585 			return status;
586 	}
587 
588 	memcpy(&socket->address, address, sizeof(sockaddr));
589 
590 	status_t status = socket->first_info->bind(socket->first_protocol,
591 		(sockaddr *)address);
592 	if (status < B_OK) {
593 		// clear address again, as binding failed
594 		socket->address.ss_len = 0;
595 	}
596 
597 	return status;
598 }
599 
600 
601 int
602 socket_connect(net_socket *socket, const struct sockaddr *address, socklen_t addressLength)
603 {
604 	if (address == NULL || addressLength == 0)
605 		return ENETUNREACH;
606 
607 	if (socket->address.ss_len == 0) {
608 		// try to bind first
609 		status_t status = socket_bind(socket, NULL, 0);
610 		if (status < B_OK)
611 			return status;
612 	}
613 
614 	return socket->first_info->connect(socket->first_protocol, address);
615 }
616 
617 
618 int
619 socket_getpeername(net_socket *socket, struct sockaddr *address, socklen_t *_addressLength)
620 {
621 	if (socket->peer.ss_len == 0)
622 		return ENOTCONN;
623 
624 	memcpy(address, &socket->peer, min_c(*_addressLength, socket->peer.ss_len));
625 	*_addressLength = socket->peer.ss_len;
626 	return B_OK;
627 }
628 
629 
630 int
631 socket_getsockname(net_socket *socket, struct sockaddr *address, socklen_t *_addressLength)
632 {
633 	if (socket->address.ss_len == 0)
634 		return ENOTCONN;
635 
636 	memcpy(address, &socket->address, min_c(*_addressLength, socket->address.ss_len));
637 	*_addressLength = socket->address.ss_len;
638 	return B_OK;
639 }
640 
641 
642 int
643 socket_getsockopt(net_socket *socket, int level, int option, void *value,
644 	int *_length)
645 {
646 	if (level != SOL_SOCKET) {
647 		return socket->first_info->control(socket->first_protocol,
648 			level | LEVEL_GET_OPTION, option, value, (size_t *)_length);
649 	}
650 
651 	switch (option) {
652 		case SO_SNDBUF:
653 		{
654 			uint32 *size = (uint32 *)value;
655 			*size = socket->send.buffer_size;
656 			*_length = sizeof(uint32);
657 			return B_OK;
658 		}
659 
660 		case SO_RCVBUF:
661 		{
662 			uint32 *size = (uint32 *)value;
663 			*size = socket->receive.buffer_size;
664 			*_length = sizeof(uint32);
665 			return B_OK;
666 		}
667 
668 		case SO_SNDLOWAT:
669 		{
670 			uint32 *size = (uint32 *)value;
671 			*size = socket->send.low_water_mark;
672 			*_length = sizeof(uint32);
673 			return B_OK;
674 		}
675 
676 		case SO_RCVLOWAT:
677 		{
678 			uint32 *size = (uint32 *)value;
679 			*size = socket->receive.low_water_mark;
680 			*_length = sizeof(uint32);
681 			return B_OK;
682 		}
683 
684 		case SO_RCVTIMEO:
685 		case SO_SNDTIMEO:
686 		{
687 			if (*_length < (int)sizeof(struct timeval))
688 				return B_BAD_VALUE;
689 
690 			bigtime_t timeout;
691 			if (option == SO_SNDTIMEO)
692 				timeout = socket->send.timeout;
693 			else
694 				timeout = socket->receive.timeout;
695 			if (timeout == B_INFINITE_TIMEOUT)
696 				timeout = 0;
697 
698 			struct timeval *timeval = (struct timeval *)value;
699 			timeval->tv_sec = timeout / 1000000LL;
700 			timeval->tv_usec = timeout % 1000000LL;
701 
702 			*_length = sizeof(struct timeval);
703 			return B_OK;
704 		}
705 
706 		case SO_NONBLOCK:
707 		{
708 			int32 *_set = (int32 *)value;
709 			*_set = socket->receive.timeout == 0 && socket->send.timeout == 0;
710 			*_length = sizeof(int32);
711 			return B_OK;
712 		}
713 
714 		case SO_ACCEPTCONN:
715 		case SO_BROADCAST:
716 		case SO_DEBUG:
717 		case SO_DONTROUTE:
718 		case SO_KEEPALIVE:
719 		case SO_OOBINLINE:
720 		case SO_REUSEADDR:
721 		case SO_REUSEPORT:
722 		case SO_USELOOPBACK:
723 		{
724 			int32 *_set = (int32 *)value;
725 			*_set = (socket->options & option) != 0;
726 			*_length = sizeof(int32);
727 			return B_OK;
728 		}
729 
730 		case SO_ERROR:
731 		{
732 			int32 *_set = (int32 *)value;
733 			*_set = socket->error;
734 			*_length = sizeof(int32);
735 
736 			socket->error = B_OK;
737 				// clear error upon retrieval
738 			return B_OK;
739 		}
740 
741 		default:
742 			break;
743 	}
744 
745 	dprintf("socket_getsockopt: unknown option %d\n", option);
746 	return ENOPROTOOPT;
747 }
748 
749 
750 int
751 socket_listen(net_socket *socket, int backlog)
752 {
753 	status_t status = socket->first_info->listen(socket->first_protocol, backlog);
754 	if (status == B_OK)
755 		socket->options |= SO_ACCEPTCONN;
756 
757 	return status;
758 }
759 
760 
761 ssize_t
762 socket_receive(net_socket *socket, msghdr *header, void *data, size_t length,
763 	int flags)
764 {
765 	size_t totalLength = length;
766 	net_buffer *buffer;
767 	iovec tmp;
768 	int i;
769 
770 	// the convention to this function is that have header been
771 	// present, { data, length } would have been iovec[0] and is
772 	// always considered like that
773 
774 	if (header) {
775 		// calculate the length considering all of the extra buffers
776 		for (i = 1; i < header->msg_iovlen; i++) {
777 			if (user_memcpy(&tmp, header->msg_iov + i, sizeof(iovec)) < B_OK)
778 				return B_BAD_ADDRESS;
779 			if (tmp.iov_len > 0 && tmp.iov_base == NULL)
780 				return B_BAD_ADDRESS;
781 			totalLength += tmp.iov_len;
782 		}
783 	}
784 
785 	status_t status = socket->first_info->read_data(
786 		socket->first_protocol, totalLength, flags, &buffer);
787 	if (status < B_OK)
788 		return status;
789 
790 	// TODO: - returning a NULL buffer when received 0 bytes
791 	//         may not make much sense as we still need the address
792 	//       - gNetBufferModule.read() uses memcpy() instead of user_memcpy
793 
794 	size_t nameLen = 0;
795 
796 	if (header) {
797 		// TODO: - consider the control buffer options
798 		nameLen = header->msg_namelen;
799 		header->msg_namelen = 0;
800 		header->msg_flags = 0;
801 	}
802 
803 	if (buffer == NULL)
804 		return 0;
805 
806 	size_t bytesReceived = buffer->size, bytesCopied = 0;
807 
808 	length = min_c(bytesReceived, length);
809 	if (gNetBufferModule.read(buffer, 0, data, length) < B_OK) {
810 		gNetBufferModule.free(buffer);
811 		return ENOBUFS;
812 	}
813 
814 	// if first copy was a success, proceed to following
815 	// copies as required
816 	bytesCopied += length;
817 
818 	if (header) {
819 		// we only start considering at iovec[1]
820 		// as { data, length } is iovec[0]
821 		for (i = 1; i < header->msg_iovlen && bytesCopied < bytesReceived; i++) {
822 			if (user_memcpy(&tmp, header->msg_iov + i, sizeof(iovec)) < B_OK)
823 				break;
824 
825 			size_t toRead = min_c(bytesReceived - bytesCopied, tmp.iov_len);
826 			if (gNetBufferModule.read(buffer, bytesCopied, tmp.iov_base,
827 										toRead) < B_OK)
828 				break;
829 
830 			bytesCopied += toRead;
831 		}
832 
833 		if (header->msg_name != NULL) {
834 			header->msg_namelen = min_c(nameLen, buffer->source.ss_len);
835 			memcpy(header->msg_name, &buffer->source, header->msg_namelen);
836 		}
837 	}
838 
839 	gNetBufferModule.free(buffer);
840 
841 	if (bytesCopied < bytesReceived) {
842 		if (header)
843 			header->msg_flags = MSG_TRUNC;
844 
845 		if (flags & MSG_TRUNC)
846 			return bytesReceived;
847 	}
848 
849 	return bytesCopied;
850 }
851 
852 
853 ssize_t
854 socket_send(net_socket *socket, msghdr *header, const void *data,
855 	size_t length, int flags)
856 {
857 	const sockaddr *address = NULL;
858 	socklen_t addressLength = 0;
859 
860 	// the convention to this function is that have header been
861 	// present, { data, length } would have been iovec[0] and is
862 	// always considered like that
863 
864 	if (header != NULL) {
865 		address = (const sockaddr *)header->msg_name;
866 		addressLength = header->msg_namelen;
867 
868 		if (header->msg_iovlen <= 1)
869 			header = NULL;
870 	}
871 
872 	if (addressLength == 0)
873 		address = NULL;
874 	else if (addressLength != 0 && address == NULL)
875 		return B_BAD_VALUE;
876 
877 	if (socket->peer.ss_len != 0) {
878 		if (address != NULL)
879 			return EISCONN;
880 
881 		// socket is connected, we use that address
882 		address = (struct sockaddr *)&socket->peer;
883 		addressLength = socket->peer.ss_len;
884 	}
885 
886 	if (address == NULL || addressLength == 0) {
887 		// don't know where to send to:
888 		return EDESTADDRREQ;
889 	}
890 
891 	if (socket->address.ss_len == 0) {
892 		// try to bind first
893 		status_t status = socket_bind(socket, NULL, 0);
894 		if (status < B_OK)
895 			return status;
896 	}
897 
898 	// TODO: useful, maybe even computed header space!
899 	net_buffer *buffer = gNetBufferModule.create(256);
900 	if (buffer == NULL)
901 		return ENOBUFS;
902 
903 	if (gNetBufferModule.append(buffer, data, length) < B_OK) {
904 		gNetBufferModule.free(buffer);
905 		return ENOBUFS;
906 	}
907 
908 	if (header) {
909 		// copy additional data into buffer
910 		for (int i = 1; i < header->msg_iovlen; i++) {
911 			iovec tmp;
912 			if (user_memcpy(&tmp, header->msg_iov + i, sizeof(iovec)) < B_OK ||
913 				gNetBufferModule.append(buffer, tmp.iov_base, tmp.iov_len) < B_OK) {
914 				gNetBufferModule.free(buffer);
915 				return ENOBUFS;
916 			}
917 
918 			length += tmp.iov_len;
919 		}
920 	}
921 
922 	buffer->flags = flags;
923 	memcpy(&buffer->source, &socket->address, socket->address.ss_len);
924 	memcpy(&buffer->destination, address, addressLength);
925 
926 	status_t status = socket->first_info->send_data(socket->first_protocol,
927 		buffer);
928 	if (status < B_OK) {
929 		size_t size = buffer->size;
930 		gNetBufferModule.free(buffer);
931 
932 		if (size != length && (status == B_INTERRUPTED || status == B_WOULD_BLOCK)) {
933 			// this appears to be a partial write
934 			return length - size;
935 		}
936 		return status;
937 	}
938 
939 	return length;
940 }
941 
942 
943 int
944 socket_setsockopt(net_socket *socket, int level, int option, const void *value,
945 	int length)
946 {
947 	if (level != SOL_SOCKET) {
948 		return socket->first_info->control(socket->first_protocol,
949 			level | LEVEL_SET_OPTION, option, (void *)value, (size_t *)&length);
950 	}
951 
952 	switch (option) {
953 		// TODO: implement other options!
954 		case SO_LINGER:
955 		{
956 			if (length < (int)sizeof(struct linger))
957 				return B_BAD_VALUE;
958 
959 			struct linger *linger = (struct linger *)value;
960 			if (linger->l_onoff) {
961 				socket->options |= SO_LINGER;
962 				socket->linger = linger->l_linger;
963 			} else {
964 				socket->options &= ~SO_LINGER;
965 				socket->linger = 0;
966 			}
967 			return B_OK;
968 		}
969 
970 		case SO_SNDBUF:
971 			// TODO: should be handled in the protocol modules - they can actually
972 			//	check if setting the value is allowed and within valid bounds.
973 			if (length != sizeof(uint32))
974 				return B_BAD_VALUE;
975 
976 			socket->send.buffer_size = *(const uint32 *)value;
977 			return B_OK;
978 
979 		case SO_RCVBUF:
980 			// TODO: see above (SO_SNDBUF)
981 			if (length != sizeof(uint32))
982 				return B_BAD_VALUE;
983 
984 			socket->receive.buffer_size = *(const uint32 *)value;
985 			return B_OK;
986 
987 		case SO_SNDLOWAT:
988 			// TODO: see above (SO_SNDBUF)
989 			if (length != sizeof(uint32))
990 				return B_BAD_VALUE;
991 
992 			socket->send.low_water_mark = *(const uint32 *)value;
993 			return B_OK;
994 
995 		case SO_RCVLOWAT:
996 			// TODO: see above (SO_SNDBUF)
997 			if (length != sizeof(uint32))
998 				return B_BAD_VALUE;
999 
1000 			socket->receive.low_water_mark = *(const uint32 *)value;
1001 			return B_OK;
1002 
1003 		case SO_RCVTIMEO:
1004 		case SO_SNDTIMEO:
1005 		{
1006 			if (length != sizeof(struct timeval))
1007 				return B_BAD_VALUE;
1008 
1009 			const struct timeval *timeval = (const struct timeval *)value;
1010 			bigtime_t timeout = timeval->tv_sec * 1000000LL + timeval->tv_usec;
1011 			if (timeout == 0)
1012 				timeout = B_INFINITE_TIMEOUT;
1013 
1014 			if (option == SO_SNDTIMEO)
1015 				socket->send.timeout = timeout;
1016 			else
1017 				socket->receive.timeout = timeout;
1018 			return B_OK;
1019 		}
1020 
1021 		case SO_NONBLOCK:
1022 			if (length != sizeof(int32))
1023 				return B_BAD_VALUE;
1024 
1025 			if (*(const int32 *)value) {
1026 				socket->send.timeout = 0;
1027 				socket->receive.timeout = 0;
1028 			} else {
1029 				socket->send.timeout = B_INFINITE_TIMEOUT;
1030 				socket->receive.timeout = B_INFINITE_TIMEOUT;
1031 			}
1032 			return B_OK;
1033 
1034 		case SO_BROADCAST:
1035 		case SO_DEBUG:
1036 		case SO_DONTROUTE:
1037 		case SO_KEEPALIVE:
1038 		case SO_OOBINLINE:
1039 		case SO_REUSEADDR:
1040 		case SO_REUSEPORT:
1041 		case SO_USELOOPBACK:
1042 			if (length != sizeof(int32))
1043 				return B_BAD_VALUE;
1044 
1045 			if (*(const int32 *)value)
1046 				socket->options |= option;
1047 			else
1048 				socket->options &= ~option;
1049 			return B_OK;
1050 
1051 		default:
1052 			break;
1053 	}
1054 
1055 	dprintf("socket_setsockopt: unknown option %d\n", option);
1056 	return ENOPROTOOPT;
1057 }
1058 
1059 
1060 int
1061 socket_shutdown(net_socket *socket, int direction)
1062 {
1063 	return socket->first_info->shutdown(socket->first_protocol, direction);
1064 }
1065 
1066 
1067 //	#pragma mark -
1068 
1069 
1070 static status_t
1071 socket_std_ops(int32 op, ...)
1072 {
1073 	switch (op) {
1074 		case B_MODULE_INIT:
1075 		{
1076 			// TODO: this is currently done in the net_stack driver
1077 			// initialize the main stack if not done so already
1078 			//module_info *module;
1079 			//return get_module(NET_STARTER_MODULE_NAME, &module);
1080 			list_init_etc(&sSocketList, offsetof(net_socket_private, link));
1081 			return benaphore_init(&sSocketLock, "socket list");
1082 		}
1083 		case B_MODULE_UNINIT:
1084 			//return put_module(NET_STARTER_MODULE_NAME);
1085 			benaphore_destroy(&sSocketLock);
1086 			return B_OK;
1087 
1088 		default:
1089 			return B_ERROR;
1090 	}
1091 }
1092 
1093 
/*!
	The module interface of the socket module: the module header followed by
	the functions of this file. The initializers are positional, so their
	order has to follow the member order of net_socket_module_info.
*/
net_socket_module_info gNetSocketModule = {
	{
		NET_SOCKET_MODULE_NAME,
		0,
		socket_std_ops
	},
	socket_open,
	socket_close,
	socket_free,

	socket_readv,
	socket_writev,
	socket_control,

	socket_read_avail,
	socket_send_avail,

	socket_send_data,
	socket_receive_data,

	socket_get_next_stat,

	// connections
	socket_spawn_pending,
	socket_delete,
	socket_dequeue_connected,
	socket_count_connected,
	socket_set_max_backlog,
	socket_connected,

	// notifications
	socket_request_notification,
	socket_cancel_notification,
	socket_notify,

	// standard socket API
	socket_accept,
	socket_bind,
	socket_connect,
	socket_getpeername,
	socket_getsockname,
	socket_getsockopt,
	socket_listen,
	socket_receive,
	socket_send,
	socket_setsockopt,
	socket_shutdown,
};
1142 
1143