xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 89755088d790ff4fe36f8aa77dacb2bd15507108)
1 /*
2  * Copyright 2006-2008, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 #include "EndpointManager.h"
14 
15 #include <net_buffer.h>
16 #include <net_datalink.h>
17 #include <net_stat.h>
18 #include <NetBufferUtilities.h>
19 #include <NetUtilities.h>
20 
21 #include <lock.h>
22 #include <util/AutoLock.h>
23 #include <util/khash.h>
24 #include <util/list.h>
25 
26 #include <KernelExport.h>
27 #include <Select.h>
28 
29 #include <netinet/in.h>
30 #include <netinet/ip.h>
31 #include <netinet/tcp.h>
32 #include <new>
33 #include <stdlib.h>
34 #include <string.h>
35 
36 
37 // References:
38 //  - RFC 793 - Transmission Control Protocol
39 //  - RFC 813 - Window and Acknowledgement Strategy in TCP
40 //
41 // Things this implementation currently doesn't implement:
42 //
43 // TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, RFC 2001, RFC 2581, RFC 3042
44 // NewReno Modification to TCP's Fast Recovery, RFC 2582
45 // Explicit Congestion Notification (ECN), RFC 3168
46 // SYN-Cache
47 // TCP Extensions for High Performance, RFC 1323
48 // SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
49 // Forward RTO-Recovery, RFC 4138
50 
// Yields a printable C string for \a address, for use in the TRACE()/PROBE()
// output. The expansion calls Domain(), so the macro can only be used in a
// scope where Domain() resolves.
#define PrintAddress(address)	AddressString(Domain(), address, true).Data()
53 
54 //#define TRACE_TCP
55 //#define PROBE_TCP
56 
// TRACE() prints a debug line that is prefixed with the current system time,
// the "this" pointer, and the name of the current fState. It expands to an
// empty statement unless TRACE_TCP is defined.
#ifndef TRACE_TCP
#	define TRACE(args...)			do { } while (0)
#else
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("TCP [%llu] %p (%12s) " format "\n", \
		system_time(), this, name_for_state(fState) , ##args)
#endif
64 
// PROBE() dumps the send side state of the endpoint (sequence numbers,
// windows, queue usage, retransmit timeout) together with the addresses and
// size of \a buffer. It expands to an empty statement unless PROBE_TCP is
// defined.
#ifndef PROBE_TCP
#	define PROBE(buffer, window)	do { } while (0)
#else
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
		system_time(), PrintAddress(buffer->source), \
		PrintAddress(buffer->destination), buffer->size, (uint32)fSendNext, \
		(uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#endif
76 
// Initial estimate for the packet round trip time (RTT); also used as the
// initial retransmit timeout of a new endpoint.
#define TCP_INITIAL_RTT		2000000

// Bit values stored in TCPEndpoint::fFlags
enum {
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 1 << 2,
};


// system_time() is divided by this factor to obtain TCP timestamp values
// (see tcp_now()).
static const int kTimestampFactor = 1024;
92 
93 
94 static inline bigtime_t
95 absolute_timeout(bigtime_t timeout)
96 {
97 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
98 		return timeout;
99 
100 	return timeout + system_time();
101 }
102 
103 
104 static inline status_t
105 posix_error(status_t error)
106 {
107 	if (error == B_TIMED_OUT)
108 		return B_WOULD_BLOCK;
109 
110 	return error;
111 }
112 
113 
114 static inline bool
115 in_window(const tcp_sequence &sequence, const tcp_sequence &rcvNext,
116 	uint32 rcvWindow)
117 {
118 	return sequence >= rcvNext && sequence < (rcvNext + rcvWindow);
119 }
120 
121 
122 static inline bool
123 segment_in_sequence(const tcp_segment_header &segment, int size,
124 	const tcp_sequence &rcvNext, uint32 rcvWindow)
125 {
126 	tcp_sequence sequence(segment.sequence);
127 	if (size == 0) {
128 		if (rcvWindow == 0)
129 			return sequence == rcvNext;
130 		return in_window(sequence, rcvNext, rcvWindow);
131 	} else {
132 		if (rcvWindow == 0)
133 			return false;
134 		return in_window(sequence, rcvNext, rcvWindow)
135 			|| in_window(sequence + size - 1, rcvNext, rcvWindow);
136 	}
137 }
138 
139 
140 static inline bool
141 is_writable(tcp_state state)
142 {
143 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
144 		|| state == ESTABLISHED || state == FINISH_RECEIVED;
145 }
146 
147 
148 static inline uint32 tcp_now()
149 {
150 	return system_time() / kTimestampFactor;
151 }
152 
153 
154 static inline uint32 tcp_diff_timestamp(uint32 base)
155 {
156 	uint32 now = tcp_now();
157 
158 	if (now > base)
159 		return now - base;
160 
161 	return now + UINT_MAX - base;
162 }
163 
164 
165 static inline bool
166 state_needs_finish(int32 state)
167 {
168 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
169 		|| state == FINISH_SENT || state == CLOSING;
170 }
171 
172 
173 //	#pragma mark -
174 
175 
176 WaitList::WaitList(const char *name)
177 {
178 	fCondition = 0;
179 	fSem = create_sem(0, name);
180 }
181 
182 
183 WaitList::~WaitList()
184 {
185 	delete_sem(fSem);
186 }
187 
188 
189 status_t
190 WaitList::InitCheck() const
191 {
192 	return fSem;
193 }
194 
195 
196 status_t
197 WaitList::Wait(MutexLocker &locker, bigtime_t timeout, bool wakeNext)
198 {
199 	locker.Unlock();
200 
201 	status_t status = B_OK;
202 
203 	while (status == B_OK && !atomic_test_and_set(&fCondition, 0, 1)) {
204 		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
205 			timeout);
206 	}
207 
208 	locker.Lock();
209 	if (status == B_OK && wakeNext)
210 		Signal();
211 
212 	return status;
213 }
214 
215 
216 void
217 WaitList::Signal()
218 {
219 	atomic_or(&fCondition, 1);
220 #ifdef __HAIKU__
221 	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_IF_WAITING_ONLY);
222 #else
223 	int32 count;
224 	if (get_sem_count(fSem, &count) == B_OK && count < 0)
225 		release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE);
226 #endif
227 }
228 
229 
230 //	#pragma mark -
231 
232 
233 TCPEndpoint::TCPEndpoint(net_socket *socket)
234 	:
235 	ProtocolSocket(socket),
236 	fManager(NULL),
237 	fReceiveList("tcp receive"),
238 	fSendList("tcp send"),
239 	fOptions(0),
240 	fSendWindowShift(0),
241 	fReceiveWindowShift(0),
242 	fSendUnacknowledged(0),
243 	fSendNext(0),
244 	fSendMax(0),
245 	fSendWindow(0),
246 	fSendMaxWindow(0),
247 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
248 	fSendQueue(socket->send.buffer_size),
249 	fInitialSendSequence(0),
250 	fDuplicateAcknowledgeCount(0),
251 	fRoute(NULL),
252 	fReceiveNext(0),
253 	fReceiveMaxAdvertised(0),
254 	fReceiveWindow(socket->receive.buffer_size),
255 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
256 	fReceiveQueue(socket->receive.buffer_size),
257 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
258 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
259 	fRetransmitTimeout(TCP_INITIAL_RTT),
260 	fReceivedTimestamp(0),
261 	fCongestionWindow(0),
262 	fSlowStartThreshold(0),
263 	fState(CLOSED),
264 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
265 {
266 	//gStackModule->init_timer(&fTimer, _TimeWait, this);
267 
268 	// TODO: to be replaced with a real locking strategy!
269 	mutex_init(&fLock, "tcp lock");
270 
271 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
272 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, this);
273 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
274 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
275 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, this);
276 }
277 
278 
279 TCPEndpoint::~TCPEndpoint()
280 {
281 	mutex_lock(&fLock);
282 
283 	gStackModule->cancel_timer(&fRetransmitTimer);
284 	gStackModule->cancel_timer(&fPersistTimer);
285 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
286 	gStackModule->cancel_timer(&fTimeWaitTimer);
287 
288 	if (fManager) {
289 		fManager->Unbind(this);
290 		return_endpoint_manager(fManager);
291 	}
292 
293 	mutex_destroy(&fLock);
294 }
295 
296 
297 status_t
298 TCPEndpoint::InitCheck() const
299 {
300 	if (fLock.sem < B_OK)
301 		return fLock.sem;
302 
303 	if (fReceiveList.InitCheck() < B_OK)
304 		return fReceiveList.InitCheck();
305 
306 	if (fSendList.InitCheck() < B_OK)
307 		return fSendList.InitCheck();
308 
309 	return B_OK;
310 }
311 
312 
313 //	#pragma mark - protocol API
314 
315 
316 status_t
317 TCPEndpoint::Open()
318 {
319 	TRACE("Open()");
320 
321 	status_t status = ProtocolSocket::Open();
322 	if (status < B_OK)
323 		return status;
324 
325 	fManager = create_endpoint_manager(Domain());
326 	if (fManager == NULL)
327 		return EAFNOSUPPORT;
328 
329 	return B_OK;
330 }
331 
332 
333 status_t
334 TCPEndpoint::Close()
335 {
336 	TRACE("Close()");
337 
338 	MutexLocker lock(fLock);
339 
340 	if (fState == LISTEN)
341 		delete_sem(fAcceptSemaphore);
342 
343 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
344 		fState = CLOSED;
345 		return B_OK;
346 	}
347 
348 	status_t status = _ShutdownEgress(true);
349 	if (status != B_OK)
350 		return status;
351 
352 	if (socket->options & SO_LINGER) {
353 		TRACE("Close(): Lingering for %i secs", socket->linger);
354 
355 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
356 
357 		while (fSendQueue.Used() > 0) {
358 			status = fSendList.Wait(lock, maximum);
359 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
360 				break;
361 			else if (status < B_OK)
362 				return status;
363 		}
364 
365 		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
366 			fSendQueue.Used());
367 	}
368 
369 	// TODO: do i need to wait until fState returns to CLOSED?
370 	return B_OK;
371 }
372 
373 
374 status_t
375 TCPEndpoint::Free()
376 {
377 	TRACE("Free()");
378 
379 	MutexLocker _(fLock);
380 
381 	if (fState <= SYNCHRONIZE_SENT || fState == TIME_WAIT)
382 		return B_OK;
383 
384 	// we are only interested in the timer, not in changing state
385 	_EnterTimeWait();
386 	return B_BUSY;
387 		// we'll be freed later when the 2MSL timer expires
388 }
389 
390 
391 /*!
392 	Creates and sends a synchronize packet to /a address, and then waits
393 	until the connection has been established or refused.
394 */
395 status_t
396 TCPEndpoint::Connect(const sockaddr *address)
397 {
398 	TRACE("Connect() on address %s", PrintAddress(address));
399 
400 	MutexLocker locker(fLock);
401 
402 	if (gStackModule->is_restarted_syscall()) {
403 		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
404 		status_t status = _WaitForEstablished(locker, timeout);
405 		TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
406 			strerror(status), timeout);
407 		return posix_error(status);
408 	}
409 
410 	// Can only call connect() from CLOSED or LISTEN states
411 	// otherwise endpoint is considered already connected
412 	if (fState == LISTEN) {
413 		// this socket is about to connect; remove pending connections in the backlog
414 		gSocketModule->set_max_backlog(socket, 0);
415 	} else if (fState == ESTABLISHED) {
416 		return EISCONN;
417 	} else if (fState != CLOSED)
418 		return EINPROGRESS;
419 
420 	status_t status = _PrepareSendPath(address);
421 	if (status < B_OK)
422 		return status;
423 
424 	TRACE("  Connect(): starting 3-way handshake...");
425 
426 	fState = SYNCHRONIZE_SENT;
427 
428 	// send SYN
429 	status = _SendQueued();
430 	if (status != B_OK) {
431 		fState = CLOSED;
432 		return status;
433 	}
434 
435 	// If we are running over Loopback, after _SendQueued() returns we
436 	// may be in ESTABLISHED already.
437 	if (fState == ESTABLISHED) {
438 		TRACE("  Connect() completed after _SendQueued()");
439 		return B_OK;
440 	}
441 
442 	// wait until 3-way handshake is complete (if needed)
443 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
444 	if (timeout == 0) {
445 		// we're a non-blocking socket
446 		TRACE("  Connect() delayed, return EINPROGRESS");
447 		return EINPROGRESS;
448 	}
449 
450 	bigtime_t absoluteTimeout = absolute_timeout(timeout);
451 	gStackModule->store_syscall_restart_timeout(absoluteTimeout);
452 
453 	status = _WaitForEstablished(locker, absoluteTimeout);
454 	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
455 		strerror(status), timeout);
456 	return posix_error(status);
457 }
458 
459 
460 status_t
461 TCPEndpoint::Accept(struct net_socket **_acceptedSocket)
462 {
463 	TRACE("Accept()");
464 
465 	MutexLocker locker(fLock);
466 
467 	status_t status;
468 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
469 	if (gStackModule->is_restarted_syscall())
470 		timeout = gStackModule->restore_syscall_restart_timeout();
471 	else
472 		gStackModule->store_syscall_restart_timeout(timeout);
473 
474 	do {
475 		locker.Unlock();
476 
477 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
478 			| B_CAN_INTERRUPT, timeout);
479 		if (status < B_OK)
480 			return status;
481 
482 		locker.Lock();
483 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
484 #ifdef TRACE_TCP
485 		if (status == B_OK)
486 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
487 #endif
488 	} while (status < B_OK);
489 
490 	return status;
491 }
492 
493 
494 status_t
495 TCPEndpoint::Bind(const sockaddr *address)
496 {
497 	if (address == NULL)
498 		return B_BAD_VALUE;
499 
500 	MutexLocker lock(fLock);
501 
502 	TRACE("Bind() on address %s", PrintAddress(address));
503 
504 	if (fState != CLOSED)
505 		return EISCONN;
506 
507 	return fManager->Bind(this, address);
508 }
509 
510 
511 status_t
512 TCPEndpoint::Unbind(struct sockaddr *address)
513 {
514 	TRACE("Unbind()");
515 
516 	MutexLocker lock(fLock);
517 	return fManager->Unbind(this);
518 }
519 
520 
521 status_t
522 TCPEndpoint::Listen(int count)
523 {
524 	TRACE("Listen()");
525 
526 	MutexLocker lock(fLock);
527 
528 	if (fState != CLOSED)
529 		return B_BAD_VALUE;
530 
531 	fAcceptSemaphore = create_sem(0, "tcp accept");
532 	if (fAcceptSemaphore < B_OK)
533 		return ENOBUFS;
534 
535 	status_t status = fManager->SetPassive(this);
536 	if (status < B_OK) {
537 		delete_sem(fAcceptSemaphore);
538 		fAcceptSemaphore = -1;
539 		return status;
540 	}
541 
542 	fState = LISTEN;
543 	return B_OK;
544 }
545 
546 
547 status_t
548 TCPEndpoint::Shutdown(int direction)
549 {
550 	TRACE("Shutdown(%i)", direction);
551 
552 	MutexLocker lock(fLock);
553 
554 	if (direction == SHUT_RD || direction == SHUT_RDWR)
555 		fFlags |= FLAG_NO_RECEIVE;
556 
557 	if (direction == SHUT_WR || direction == SHUT_RDWR)
558 		_ShutdownEgress(false);
559 
560 	return B_OK;
561 }
562 
563 
564 /*!
565 	Puts data contained in \a buffer into send buffer
566 */
567 status_t
568 TCPEndpoint::SendData(net_buffer *buffer)
569 {
570 	MutexLocker lock(fLock);
571 
572 	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
573 		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
574 		  fSendQueue.Free());
575 
576 	if (fState == CLOSED)
577 		return ENOTCONN;
578 	if (fState == LISTEN)
579 		return EDESTADDRREQ;
580 	if (fState == FINISH_SENT || fState == FINISH_ACKNOWLEDGED
581 		|| fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
582 		|| fState == TIME_WAIT) {
583 		// TODO: send SIGPIPE signal to app?
584 		return EPIPE;
585 	}
586 
587 	if (buffer->size > 0) {
588 		if (buffer->size > fSendQueue.Size())
589 			return ENOBUFS;
590 
591 		bigtime_t timeout = absolute_timeout(socket->send.timeout);
592 		if (gStackModule->is_restarted_syscall())
593 			timeout = gStackModule->restore_syscall_restart_timeout();
594 		else
595 			gStackModule->store_syscall_restart_timeout(timeout);
596 
597 		while (fSendQueue.Free() < buffer->size) {
598 			status_t status = fSendList.Wait(lock, timeout);
599 			if (status < B_OK) {
600 				TRACE("  SendData() returning %s (%d)",
601 					strerror(posix_error(status)), (int)posix_error(status));
602 				return posix_error(status);
603 			}
604 		}
605 
606 		fSendQueue.Add(buffer);
607 	}
608 
609 	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());
610 
611 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
612 		_SendQueued();
613 
614 	return B_OK;
615 }
616 
617 
618 ssize_t
619 TCPEndpoint::SendAvailable()
620 {
621 	MutexLocker locker(fLock);
622 
623 	ssize_t available;
624 
625 	if (is_writable(fState))
626 		available = fSendQueue.Free();
627 	else
628 		available = EPIPE;
629 
630 	TRACE("SendAvailable(): %li", available);
631 	return available;
632 }
633 
634 
635 status_t
636 TCPEndpoint::FillStat(net_stat *stat)
637 {
638 	MutexLocker _(fLock);
639 
640 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
641 	stat->receive_queue_size = fReceiveQueue.Available();
642 	stat->send_queue_size = fSendQueue.Used();
643 
644 	return B_OK;
645 }
646 
647 
648 status_t
649 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
650 {
651 	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);
652 
653 	MutexLocker locker(fLock);
654 
655 	*_buffer = NULL;
656 
657 	if (fState == CLOSED)
658 		return ENOTCONN;
659 
660 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
661 	if (gStackModule->is_restarted_syscall())
662 		timeout = gStackModule->restore_syscall_restart_timeout();
663 	else
664 		gStackModule->store_syscall_restart_timeout(timeout);
665 
666 	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
667 		if (flags & MSG_DONTWAIT)
668 			return B_WOULD_BLOCK;
669 
670 		status_t status = _WaitForEstablished(locker, timeout);
671 		if (status < B_OK)
672 			return posix_error(status);
673 	}
674 
675 	size_t dataNeeded = socket->receive.low_water_mark;
676 
677 	// When MSG_WAITALL is set then the function should block
678 	// until the full amount of data can be returned.
679 	if (flags & MSG_WAITALL)
680 		dataNeeded = numBytes;
681 
682 	// TODO: add support for urgent data (MSG_OOB)
683 
684 	while (true) {
685 		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
686 			|| fState == TIME_WAIT) {
687 			// ``Connection closing''.
688 			return B_OK;
689 		}
690 
691 		if (fReceiveQueue.Available() > 0) {
692 			if (fReceiveQueue.Available() >= dataNeeded
693 				|| (fReceiveQueue.PushedData() > 0
694 					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
695 				break;
696 		} else if (fState == FINISH_RECEIVED) {
697 			// ``If no text is awaiting delivery, the RECEIVE will
698 			//   get a Connection closing''.
699 			return B_OK;
700 		}
701 
702 		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
703 			return B_WOULD_BLOCK;
704 
705 		status_t status = fReceiveList.Wait(locker, timeout, false);
706 		if (status < B_OK) {
707 			// The Open Group base specification mentions that EINTR should be
708 			// returned if the recv() is interrupted before _any data_ is
709 			// available. So we actually check if there is data, and if so,
710 			// push it to the user.
711 			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
712 				&& fReceiveQueue.Available() > 0)
713 				break;
714 
715 			return posix_error(status);
716 		}
717 	}
718 
719 	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());
720 
721 	if (numBytes < fReceiveQueue.Available())
722 		fReceiveList.Signal();
723 
724 	bool clone = (flags & MSG_PEEK);
725 
726 	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
727 
728 	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());
729 
730 	// if we are opening the window, check if we should send an ACK
731 	if (!clone)
732 		SendAcknowledge(false);
733 
734 	return receivedBytes;
735 }
736 
737 
738 ssize_t
739 TCPEndpoint::ReadAvailable()
740 {
741 	MutexLocker locker(fLock);
742 
743 	TRACE("ReadAvailable(): %li", _AvailableData());
744 
745 	return _AvailableData();
746 }
747 
748 
749 status_t
750 TCPEndpoint::SetSendBufferSize(size_t length)
751 {
752 	MutexLocker _(fLock);
753 	fSendQueue.SetMaxBytes(length);
754 	return B_OK;
755 }
756 
757 
758 status_t
759 TCPEndpoint::SetReceiveBufferSize(size_t length)
760 {
761 	MutexLocker _(fLock);
762 	fReceiveQueue.SetMaxBytes(length);
763 	return B_OK;
764 }
765 
766 
767 status_t
768 TCPEndpoint::SetOption(int option, const void *_value, int length)
769 {
770 	if (option != TCP_NODELAY)
771 		return B_BAD_VALUE;
772 
773 	if (length != sizeof(int))
774 		return B_BAD_VALUE;
775 
776 	const int *value = (const int *)_value;
777 
778 	MutexLocker _(fLock);
779 	if (*value)
780 		fOptions |= TCP_NODELAY;
781 	else
782 		fOptions &= ~TCP_NODELAY;
783 
784 	return B_OK;
785 }
786 
787 
788 //	#pragma mark - misc
789 
790 
791 bool
792 TCPEndpoint::IsBound() const
793 {
794 	return !LocalAddress().IsEmpty(true);
795 }
796 
797 
798 status_t
799 TCPEndpoint::DelayedAcknowledge()
800 {
801 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
802 		// timer was active, send an ACK now (with the exception above,
803 		// we send every other ACK)
804 		return SendAcknowledge(true);
805 	}
806 
807 	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
808 		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
809 	return B_OK;
810 }
811 
812 
813 status_t
814 TCPEndpoint::SendAcknowledge(bool force)
815 {
816 	return _SendQueued(force, 0);
817 }
818 
819 
820 void
821 TCPEndpoint::_StartPersistTimer()
822 {
823 	gStackModule->set_timer(&fPersistTimer, 1000000LL);
824 }
825 
826 
827 void
828 TCPEndpoint::_EnterTimeWait()
829 {
830 	TRACE("_EnterTimeWait()\n");
831 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
832 }
833 
834 
835 status_t
836 TCPEndpoint::UpdateTimeWait()
837 {
838 	return B_OK;
839 }
840 
841 
842 //	#pragma mark - receive
843 
844 
845 int32
846 TCPEndpoint::_ListenReceive(tcp_segment_header &segment, net_buffer *buffer)
847 {
848 	TRACE("ListenReceive()");
849 
850 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
851 	// but the error behaviour differs
852 	if (segment.flags & TCP_FLAG_RESET)
853 		return DROP;
854 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
855 		return DROP | RESET;
856 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
857 		return DROP;
858 
859 	// TODO: drop broadcast/multicast
860 
861 	// spawn new endpoint for accept()
862 	net_socket *newSocket;
863 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK)
864 		return DROP;
865 
866 	return ((TCPEndpoint *)newSocket->first_protocol)->Spawn(this,
867 		segment, buffer);
868 }
869 
870 
871 int32
872 TCPEndpoint::Spawn(TCPEndpoint *parent, tcp_segment_header &segment,
873 	net_buffer *buffer)
874 {
875 	MutexLocker _(fLock);
876 
877 	// TODO error checking
878 	ProtocolSocket::Open();
879 
880 	fState = SYNCHRONIZE_RECEIVED;
881 	fManager = parent->fManager;
882 
883 	LocalAddress().SetTo(buffer->destination);
884 	PeerAddress().SetTo(buffer->source);
885 
886 	TRACE("Spawn()");
887 
888 	// TODO: proper error handling!
889 	if (fManager->BindChild(this) < B_OK)
890 		return DROP;
891 
892 	if (_PrepareSendPath(*PeerAddress()) < B_OK)
893 		return DROP;
894 
895 	fOptions = parent->fOptions;
896 	fAcceptSemaphore = parent->fAcceptSemaphore;
897 
898 	_PrepareReceivePath(segment);
899 
900 	// send SYN+ACK
901 	if (_SendQueued() < B_OK)
902 		return DROP;
903 
904 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
905 		// we handled this flag now, it must not be set for further processing
906 
907 	return _Receive(segment, buffer);
908 }
909 
910 
911 void
912 TCPEndpoint::DumpInternalState() const
913 {
914 	kprintf("Lock: { sem: %ld, holder: %ld }\n", fLock.sem, fLock.holder);
915 	kprintf("AcceptSem: %ld\n", fAcceptSemaphore);
916 	kprintf("Options: 0x%lx\n", (uint32)fOptions);
917 	kprintf("SendWindowShift: %lu\n", (uint32)fSendWindowShift);
918 	kprintf("ReceiveWindowShift: %lu\n", (uint32)fReceiveWindowShift);
919 	kprintf("SendUnacknowledged: %lu\n", (uint32)fSendUnacknowledged);
920 	kprintf("SendNext: %lu\n", (uint32)fSendNext);
921 	kprintf("SendMax: %lu\n", (uint32)fSendMax);
922 	kprintf("SendWindow: %lu\n", fSendWindow);
923 	kprintf("SendMaxWindow: %lu\n", fSendMaxWindow);
924 	kprintf("SendMaxSegmentSize: %lu\n", fSendMaxSegmentSize);
925 	kprintf("Send-Q: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
926 	kprintf("LastAcknowledgeSent: %lu\n", (uint32)fLastAcknowledgeSent);
927 	kprintf("InitialSendSequence: %lu\n", (uint32)fInitialSendSequence);
928 	kprintf("DuplicateAcknowledgeCount: %lu\n", fDuplicateAcknowledgeCount);
929 	kprintf("ReceiveNext: %lu\n", (uint32)fReceiveNext);
930 	kprintf("ReceiveMaxAdvertised: %lu\n", (uint32)fReceiveMaxAdvertised);
931 	kprintf("ReceiveWindow: %lu\n", (uint32)fReceiveWindow);
932 	kprintf("ReceiveMaxSegmentSize: %lu\n", (uint32)fReceiveMaxSegmentSize);
933 	kprintf("Recv-Q: %lu / %lu\n", fReceiveQueue.Available(),
934 		fReceiveQueue.Size());
935 	kprintf("InitialReceiveSequence: %lu\n", (uint32)fInitialReceiveSequence);
936 	kprintf("RoundTripTime: %ld (dev %ld)\n", fRoundTripTime,
937 		fRoundTripDeviation);
938 	kprintf("RetransmitTimeout: %llu\n", (uint64)fRetransmitTimeout);
939 	kprintf("CongestionWindow: %lu\n", fCongestionWindow);
940 	kprintf("SlowStartThreshold: %lu\n", fSlowStartThreshold);
941 	kprintf("State: %s\n", name_for_state(fState));
942 	kprintf("Flags: 0x%lx\n", fFlags);
943 }
944 
945 
946 void
947 TCPEndpoint::_HandleReset(status_t error)
948 {
949 	gStackModule->cancel_timer(&fRetransmitTimer);
950 
951 	socket->error = error;
952 	fState = CLOSED;
953 
954 	fSendList.Signal();
955 	_NotifyReader();
956 
957 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
958 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
959 }
960 
961 
962 int32
963 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
964 	net_buffer *buffer)
965 {
966 	TRACE("_SynchronizeSentReceive()");
967 
968 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
969 		&& (fInitialSendSequence >= segment.acknowledge
970 			|| fSendMax < segment.acknowledge))
971 		return DROP | RESET;
972 
973 	if (segment.flags & TCP_FLAG_RESET) {
974 		_HandleReset(ECONNREFUSED);
975 		return DROP;
976 	}
977 
978 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
979 		return DROP;
980 
981 	fSendUnacknowledged = segment.acknowledge;
982 	_PrepareReceivePath(segment);
983 
984 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
985 		_MarkEstablished();
986 	} else {
987 		// simultaneous open
988 		fState = SYNCHRONIZE_RECEIVED;
989 	}
990 
991 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
992 		// we handled this flag now, it must not be set for further processing
993 
994 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
995 }
996 
997 
998 int32
999 TCPEndpoint::SegmentReceived(tcp_segment_header &segment, net_buffer *buffer)
1000 {
1001 	MutexLocker locker(fLock);
1002 
1003 	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s",
1004 		buffer, buffer->size, PrintAddress(buffer->source),
1005 		PrintAddress(buffer->destination));
1006 	TRACE("                   flags 0x%x, seq %lu, ack %lu, wnd %lu",
1007 		segment.flags, segment.sequence, segment.acknowledge,
1008 		(uint32)segment.advertised_window << fSendWindowShift);
1009 
1010 	int32 segmentAction = DROP;
1011 
1012 	switch (fState) {
1013 		case LISTEN:
1014 			segmentAction = _ListenReceive(segment, buffer);
1015 			break;
1016 
1017 		case SYNCHRONIZE_SENT:
1018 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1019 			break;
1020 
1021 		case SYNCHRONIZE_RECEIVED:
1022 		case ESTABLISHED:
1023 		case FINISH_RECEIVED:
1024 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1025 		case FINISH_SENT:
1026 		case FINISH_ACKNOWLEDGED:
1027 		case CLOSING:
1028 		case TIME_WAIT:
1029 		case CLOSED:
1030 			segmentAction = _SegmentReceived(segment, buffer);
1031 			break;
1032 	}
1033 
1034 	// process acknowledge action as asked for by the *Receive() method
1035 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1036 		SendAcknowledge(true);
1037 	else if (segmentAction & ACKNOWLEDGE)
1038 		DelayedAcknowledge();
1039 
1040 	return segmentAction;
1041 }
1042 
1043 int32
1044 TCPEndpoint::_SegmentReceived(tcp_segment_header &segment, net_buffer *buffer)
1045 {
1046 	uint32 advertisedWindow = (uint32)segment.advertised_window
1047 		<< fSendWindowShift;
1048 
1049 	// First, handle the most common case for uni-directional data transfer
1050 	// (known as header prediction - the segment must not change the window,
1051 	// and must be the expected sequence, and contain no control flags)
1052 
1053 	if (fState == ESTABLISHED
1054 		&& segment.AcknowledgeOnly()
1055 		&& fReceiveNext == segment.sequence
1056 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1057 		&& fSendNext == fSendMax) {
1058 		_UpdateTimestamps(segment, buffer->size);
1059 
1060 		if (buffer->size == 0) {
1061 			// this is a pure acknowledge segment - we're on the sending end
1062 			if (fSendUnacknowledged < segment.acknowledge
1063 				&& fSendMax >= segment.acknowledge) {
1064 				_Acknowledged(segment);
1065 				return DROP;
1066 			}
1067 		} else if (segment.acknowledge == fSendUnacknowledged
1068 			&& fReceiveQueue.IsContiguous()
1069 			&& fReceiveQueue.Free() >= buffer->size
1070 			&& !(fFlags & FLAG_NO_RECEIVE)) {
1071 			if (_AddData(segment, buffer))
1072 				_NotifyReader();
1073 
1074 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1075 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1076 		}
1077 	}
1078 
1079 	// The fast path was not applicable, so we continue with the standard
1080 	// processing of the incoming segment
1081 
1082 	if (fState != SYNCHRONIZE_SENT && fState != LISTEN && fState != CLOSED) {
1083 		// 1. check sequence number
1084 		if (!segment_in_sequence(segment, buffer->size, fReceiveNext,
1085 				fReceiveWindow)) {
1086 			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1087 				(uint32)fReceiveNext, fReceiveWindow);
1088 			if (segment.flags & TCP_FLAG_RESET) {
1089 				// TODO: this doesn't look right - review!
1090 				return DROP;
1091 			}
1092 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1093 		}
1094 	}
1095 
1096 	return _Receive(segment, buffer);
1097 }
1098 
1099 
1100 //	#pragma mark - send
1101 
1102 
1103 inline uint8
1104 TCPEndpoint::_CurrentFlags()
1105 {
1106 	// we don't set FLAG_FINISH here, instead we do it
1107 	// conditionally below depending if we are sending
1108 	// the last bytes of the send queue.
1109 
1110 	switch (fState) {
1111 		case CLOSED:
1112 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1113 
1114 		case SYNCHRONIZE_SENT:
1115 			return TCP_FLAG_SYNCHRONIZE;
1116 		case SYNCHRONIZE_RECEIVED:
1117 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1118 
1119 		case ESTABLISHED:
1120 		case FINISH_RECEIVED:
1121 		case FINISH_ACKNOWLEDGED:
1122 		case TIME_WAIT:
1123 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1124 		case FINISH_SENT:
1125 		case CLOSING:
1126 			return TCP_FLAG_ACKNOWLEDGE;
1127 
1128 		default:
1129 			return 0;
1130 	}
1131 }
1132 
1133 
1134 inline bool
1135 TCPEndpoint::_ShouldSendSegment(tcp_segment_header &segment, uint32 length,
1136 	uint32 segmentMaxSize, uint32 flightSize)
1137 {
1138 	if (length > 0) {
1139 		// Avoid the silly window syndrome - we only send a segment in case:
1140 		// - we have a full segment to send, or
1141 		// - we're at the end of our buffer queue, or
1142 		// - the buffer is at least larger than half of the maximum send window, or
1143 		// - we're retransmitting data
1144 		if (length == segmentMaxSize
1145 			|| (fOptions & TCP_NODELAY) != 0
1146 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1147 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1148 			return true;
1149 	}
1150 
1151 	// check if we need to send a window update to the peer
1152 	if (segment.advertised_window > 0) {
1153 		// correct the window to take into account what already has been advertised
1154 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1155 			- (fReceiveMaxAdvertised - fReceiveNext);
1156 
1157 		// if we can advertise a window larger than twice the maximum segment
1158 		// size, or half the maximum buffer size we send a window update
1159 		if (window >= (fReceiveMaxSegmentSize << 1)
1160 			|| window >= (socket->receive.buffer_size >> 1))
1161 			return true;
1162 	}
1163 
1164 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1165 			| TCP_FLAG_RESET)) != 0)
1166 		return true;
1167 
1168 	// there is no reason to send a segment just now
1169 	return false;
1170 }
1171 
1172 
1173 status_t
1174 TCPEndpoint::_SendQueued(bool force)
1175 {
1176 	return _SendQueued(force, fSendWindow);
1177 }
1178 
1179 
1180 /*!
1181 	Sends one or more TCP segments with the data waiting in the queue, or some
1182 	specific flags that need to be sent.
1183 */
1184 status_t
1185 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1186 {
1187 	if (fRoute == NULL)
1188 		return B_ERROR;
1189 
1190 	// in passive state?
1191 	if (fState == LISTEN)
1192 		return B_ERROR;
1193 
1194 	tcp_segment_header segment(_CurrentFlags());
1195 
1196 	if ((fOptions & TCP_NOOPT) == 0) {
1197 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1198 			segment.options |= TCP_HAS_TIMESTAMPS;
1199 			segment.timestamp_reply = fReceivedTimestamp;
1200 			segment.timestamp_value = tcp_now();
1201 		}
1202 
1203 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1204 			&& fSendNext == fInitialSendSequence) {
1205 			// add connection establishment options
1206 			segment.max_segment_size = fReceiveMaxSegmentSize;
1207 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1208 				segment.options |= TCP_HAS_WINDOW_SCALE;
1209 				segment.window_shift = fReceiveWindowShift;
1210 			}
1211 		}
1212 	}
1213 
1214 	size_t availableBytes = fReceiveQueue.Free();
1215 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1216 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1217 	else
1218 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1219 
1220 	segment.acknowledge = fReceiveNext;
1221 	segment.urgent_offset = 0;
1222 
1223 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
1224 		sendWindow = fCongestionWindow;
1225 
1226 	// SND.UNA  SND.NXT        SND.MAX
1227 	//  |        |              |
1228 	//  v        v              v
1229 	//  -----------------------------------
1230 	//  | effective window           |
1231 	//  -----------------------------------
1232 
1233 	// Flight size represents the window of data which is currently in the
1234 	// ether. We should never send data such as the flight size becomes larger
1235 	// than the effective window. Note however that the effective window may be
1236 	// reduced (by congestion for instance), so at some point in time flight
1237 	// size may be larger than the currently calculated window.
1238 
1239 	uint32 flightSize = fSendMax - fSendUnacknowledged;
1240 	uint32 consumedWindow = fSendNext - fSendUnacknowledged;
1241 
1242 	if (consumedWindow > sendWindow) {
1243 		sendWindow = 0;
1244 		// TODO enter persist state? try to get a window update.
1245 	} else
1246 		sendWindow -= consumedWindow;
1247 
1248 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
1249 		// send one byte of data to ask for a window update
1250 		// (triggered by the persist timer)
1251 		sendWindow = 1;
1252 	}
1253 
1254 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
1255 	tcp_sequence previousSendNext = fSendNext;
1256 
1257 	do {
1258 		uint32 segmentMaxSize = fSendMaxSegmentSize
1259 			- tcp_options_length(segment);
1260 		uint32 segmentLength = min_c(length, segmentMaxSize);
1261 
1262 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
1263 			if (state_needs_finish(fState))
1264 				segment.flags |= TCP_FLAG_FINISH;
1265 			if (length > 0)
1266 				segment.flags |= TCP_FLAG_PUSH;
1267 		}
1268 
1269 		// Determine if we should really send this segment
1270 		if (!force && !_ShouldSendSegment(segment, segmentLength,
1271 			segmentMaxSize, flightSize)) {
1272 			if (fSendQueue.Available()
1273 				&& !gStackModule->is_timer_active(&fPersistTimer)
1274 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
1275 				_StartPersistTimer();
1276 			break;
1277 		}
1278 
1279 		net_buffer *buffer = gBufferModule->create(256);
1280 		if (buffer == NULL)
1281 			return B_NO_MEMORY;
1282 
1283 		status_t status = B_OK;
1284 		if (segmentLength > 0)
1285 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
1286 		if (status < B_OK) {
1287 			gBufferModule->free(buffer);
1288 			return status;
1289 		}
1290 
1291 		LocalAddress().CopyTo(buffer->source);
1292 		PeerAddress().CopyTo(buffer->destination);
1293 
1294 		uint32 size = buffer->size;
1295 		segment.sequence = fSendNext;
1296 
1297 		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s",
1298 			buffer, buffer->size, PrintAddress(buffer->source),
1299 			PrintAddress(buffer->destination));
1300 		TRACE("              flags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu"
1301 			", ssthresh %lu", segment.flags, segment.sequence,
1302 			segment.acknowledge, segment.advertised_window,
1303 			fCongestionWindow, fSlowStartThreshold);
1304 		TRACE("              len %lu first %lu last %lu", segmentLength,
1305 			(uint32)fSendQueue.FirstSequence(),
1306 			(uint32)fSendQueue.LastSequence());
1307 
1308 		PROBE(buffer, sendWindow);
1309 		sendWindow -= buffer->size;
1310 
1311 		status = add_tcp_header(AddressModule(), segment, buffer);
1312 		if (status != B_OK) {
1313 			gBufferModule->free(buffer);
1314 			return status;
1315 		}
1316 
1317 		// Update send status - we need to do this before we send the data
1318 		// for local connections as the answer is directly handled
1319 
1320 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
1321 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
1322 			segment.max_segment_size = 0;
1323 			size++;
1324 		}
1325 
1326 		if (segment.flags & TCP_FLAG_FINISH)
1327 			size++;
1328 
1329 		uint32 sendMax = fSendMax;
1330 		fSendNext += size;
1331 		if (fSendMax < fSendNext)
1332 			fSendMax = fSendNext;
1333 
1334 		fReceiveMaxAdvertised = fReceiveNext
1335 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
1336 
1337 		status = next->module->send_routed_data(next, fRoute, buffer);
1338 		if (status < B_OK) {
1339 			gBufferModule->free(buffer);
1340 
1341 			fSendNext = segment.sequence;
1342 			fSendMax = sendMax;
1343 				// restore send status
1344 			return status;
1345 		}
1346 
1347 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1348 			fLastAcknowledgeSent = segment.acknowledge;
1349 
1350 		length -= segmentLength;
1351 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
1352 			| TCP_FLAG_FINISH);
1353 	} while (length > 0);
1354 
1355 	// if we sent data from the beggining of the send queue,
1356 	// start the retransmition timer
1357 	if (previousSendNext == fSendUnacknowledged
1358 		&& fSendNext > previousSendNext) {
1359 		TRACE("  SendQueue(): set retransmit timer with rto %llu",
1360 			fRetransmitTimeout);
1361 
1362 		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
1363 	}
1364 
1365 	return B_OK;
1366 }
1367 
1368 
1369 int
1370 TCPEndpoint::_MaxSegmentSize(const sockaddr *address) const
1371 {
1372 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
1373 }
1374 
1375 
1376 status_t
1377 TCPEndpoint::_ShutdownEgress(bool closing)
1378 {
1379 	tcp_state previousState = fState;
1380 
1381 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1382 		fState = FINISH_SENT;
1383 	else if (fState == FINISH_RECEIVED)
1384 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1385 	else
1386 		return B_OK;
1387 
1388 	status_t status = _SendQueued();
1389 	if (status != B_OK) {
1390 		fState = previousState;
1391 		return status;
1392 	}
1393 
1394 	return B_OK;
1395 }
1396 
1397 
1398 ssize_t
1399 TCPEndpoint::_AvailableData() const
1400 {
1401 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1402 	//       the application of FLAG_NO_RECEIVE in listen()ing
1403 	//       sockets.
1404 	if (fState == LISTEN)
1405 		return gSocketModule->count_connected(socket);
1406 	if (fState == SYNCHRONIZE_SENT)
1407 		return 0;
1408 
1409 	ssize_t availableData = fReceiveQueue.Available();
1410 
1411 	if (availableData == 0 && !_ShouldReceive())
1412 		return ENOTCONN;
1413 
1414 	return availableData;
1415 }
1416 
1417 
1418 void
1419 TCPEndpoint::_NotifyReader()
1420 {
1421 	fReceiveList.Signal();
1422 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1423 }
1424 
1425 
1426 bool
1427 TCPEndpoint::_ShouldReceive() const
1428 {
1429 	if (fFlags & FLAG_NO_RECEIVE)
1430 		return false;
1431 
1432 	return fState == ESTABLISHED || fState == FINISH_SENT
1433 		|| fState == FINISH_ACKNOWLEDGED;
1434 }
1435 
1436 
1437 int32
1438 TCPEndpoint::_Receive(tcp_segment_header &segment, net_buffer *buffer)
1439 {
1440 	uint32 advertisedWindow = (uint32)segment.advertised_window
1441 		<< fSendWindowShift;
1442 
1443 	size_t segmentLength = buffer->size;
1444 
1445 	if (segment.flags & TCP_FLAG_RESET) {
1446 		// is this a valid reset?
1447 		if (fLastAcknowledgeSent <= segment.sequence
1448 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1449 				+ fReceiveWindow)) {
1450 			status_t error;
1451 			if (fState == SYNCHRONIZE_RECEIVED)
1452 				error = ECONNREFUSED;
1453 			else if (fState == CLOSING || fState == TIME_WAIT
1454 				|| fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1455 				error = ENOTCONN;
1456 			else
1457 				error = ECONNRESET;
1458 
1459 			_HandleReset(error);
1460 		}
1461 
1462 		return DROP;
1463 	}
1464 
1465 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1466 		|| (fState == SYNCHRONIZE_RECEIVED
1467 			&& (fInitialReceiveSequence > segment.sequence
1468 				|| (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1469 					&& (fSendUnacknowledged > segment.acknowledge
1470 						|| fSendMax < segment.acknowledge)))) {
1471 		// reset the connection - either the initial SYN was faulty, or we
1472 		// received a SYN within the data stream
1473 		return DROP | RESET;
1474 	}
1475 
1476 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1477 		// the window must not shrink
1478 
1479 	// trim buffer to be within the receive window
1480 	int32 drop = fReceiveNext - segment.sequence;
1481 	if (drop > 0) {
1482 		if ((uint32)drop > buffer->size
1483 			|| ((uint32)drop == buffer->size
1484 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1485 			// don't accidently remove a FIN we shouldn't remove
1486 			segment.flags &= ~TCP_FLAG_FINISH;
1487 			drop = buffer->size;
1488 		}
1489 
1490 		// remove duplicate data at the start
1491 		TRACE("* remove %ld bytes from the start", drop);
1492 		gBufferModule->remove_header(buffer, drop);
1493 		segment.sequence += drop;
1494 	}
1495 
1496 	int32 action = KEEP;
1497 
1498 	drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow);
1499 	if (drop > 0) {
1500 		// remove data exceeding our window
1501 		if ((uint32)drop >= buffer->size) {
1502 			// if we can accept data, or the segment is not what we'd expect,
1503 			// drop the segment (an immediate acknowledge is always triggered)
1504 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1505 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1506 
1507 			action |= IMMEDIATE_ACKNOWLEDGE;
1508 		}
1509 
1510 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1511 			// we need to remove the finish, too, as part of the data
1512 			drop--;
1513 		}
1514 
1515 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1516 		TRACE("* remove %ld bytes from the end", drop);
1517 		gBufferModule->remove_trailer(buffer, drop);
1518 	}
1519 
1520 #ifdef TRACE_TCP
1521 	if (advertisedWindow > fSendWindow) {
1522 		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1523 			advertisedWindow);
1524 	}
1525 #endif
1526 
1527 	fSendWindow = advertisedWindow;
1528 	if (advertisedWindow > fSendMaxWindow)
1529 		fSendMaxWindow = advertisedWindow;
1530 
1531 	// Then look at the acknowledgement for any updates
1532 
1533 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1534 		// process acknowledged data
1535 		if (fState == SYNCHRONIZE_RECEIVED)
1536 			_MarkEstablished();
1537 
1538 		if (fSendMax < segment.acknowledge || fState == TIME_WAIT)
1539 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1540 
1541 		if (segment.acknowledge < fSendUnacknowledged) {
1542 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1543 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1544 				TRACE("Receive(): duplicate ack!");
1545 
1546 				_DuplicateAcknowledge(segment);
1547 			}
1548 
1549 			return DROP;
1550 		} else {
1551 			// this segment acknowledges in flight data
1552 
1553 			if (fDuplicateAcknowledgeCount >= 3) {
1554 				// deflate the window.
1555 				fCongestionWindow = fSlowStartThreshold;
1556 			}
1557 
1558 			fDuplicateAcknowledgeCount = 0;
1559 
1560 			if (fSendMax == segment.acknowledge)
1561 				TRACE("Receive(): all inflight data ack'd!");
1562 
1563 			if (segment.acknowledge > fSendQueue.LastSequence()
1564 					&& fState > ESTABLISHED) {
1565 				TRACE("Receive(): FIN has been acknowledged!");
1566 
1567 				switch (fState) {
1568 					case FINISH_SENT:
1569 						fState = FINISH_ACKNOWLEDGED;
1570 						break;
1571 					case CLOSING:
1572 						fState = TIME_WAIT;
1573 						_EnterTimeWait();
1574 						return DROP;
1575 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1576 						fState = CLOSED;
1577 						break;
1578 
1579 					default:
1580 						break;
1581 				}
1582 			}
1583 
1584 			if (fState != CLOSED)
1585 				_Acknowledged(segment);
1586 		}
1587 	}
1588 
1589 	if (segment.flags & TCP_FLAG_URGENT) {
1590 		if (fState == ESTABLISHED || fState == FINISH_SENT
1591 			|| fState == FINISH_ACKNOWLEDGED) {
1592 			// TODO: Handle urgent data:
1593 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1594 			//  - signal the user that urgent data is available (SIGURG)
1595 		}
1596 	}
1597 
1598 	bool notify = false;
1599 
1600 	if (buffer->size > 0 &&	_ShouldReceive())
1601 		notify = _AddData(segment, buffer);
1602 	else
1603 		action = (action & ~KEEP) | DROP;
1604 
1605 	if (segment.flags & TCP_FLAG_FINISH) {
1606 		segmentLength++;
1607 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1608 			TRACE("Receive(): peer is finishing connection!");
1609 			fReceiveNext++;
1610 			notify = true;
1611 
1612 			// FIN implies PSH
1613 			fReceiveQueue.SetPushPointer();
1614 
1615 			// we'll reply immediatly to the FIN if we are not
1616 			// transitioning to TIME WAIT so we immediatly ACK it.
1617 			action |= IMMEDIATE_ACKNOWLEDGE;
1618 
1619 			// other side is closing connection; change states
1620 			switch (fState) {
1621 				case ESTABLISHED:
1622 				case SYNCHRONIZE_RECEIVED:
1623 					fState = FINISH_RECEIVED;
1624 					break;
1625 				case FINISH_SENT:
1626 					// simultaneous close
1627 					fState = CLOSING;
1628 					break;
1629 				case FINISH_ACKNOWLEDGED:
1630 					fState = TIME_WAIT;
1631 					_EnterTimeWait();
1632 					break;
1633 				default:
1634 					break;
1635 			}
1636 		}
1637 	}
1638 
1639 	if (notify)
1640 		_NotifyReader();
1641 
1642 	if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1643 		action |= ACKNOWLEDGE;
1644 
1645 	_UpdateTimestamps(segment, segmentLength);
1646 
1647 	TRACE("Receive() Action %ld", action);
1648 
1649 	return action;
1650 }
1651 
1652 
1653 void
1654 TCPEndpoint::_UpdateTimestamps(tcp_segment_header &segment,
1655 	size_t segmentLength)
1656 {
1657 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1658 		tcp_sequence sequence(segment.sequence);
1659 
1660 		if (fLastAcknowledgeSent >= sequence
1661 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1662 			fReceivedTimestamp = segment.timestamp_value;
1663 	}
1664 }
1665 
1666 
1667 void
1668 TCPEndpoint::_MarkEstablished()
1669 {
1670 	fState = ESTABLISHED;
1671 
1672 	if (socket->parent != NULL) {
1673 		gSocketModule->set_connected(socket);
1674 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1675 	}
1676 
1677 	fSendList.Signal();
1678 }
1679 
1680 
1681 status_t
1682 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1683 {
1684 	while (fState != ESTABLISHED) {
1685 		if (socket->error != B_OK)
1686 			return socket->error;
1687 
1688 		status_t status = fSendList.Wait(locker, timeout);
1689 		if (status < B_OK)
1690 			return status;
1691 	}
1692 
1693 	return B_OK;
1694 }
1695 
1696 
1697 bool
1698 TCPEndpoint::_AddData(tcp_segment_header &segment, net_buffer *buffer)
1699 {
1700 	fReceiveQueue.Add(buffer, segment.sequence);
1701 	fReceiveNext = fReceiveQueue.NextSequence();
1702 
1703 	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1704 		(uint32)fReceiveNext, fReceiveQueue.Available());
1705 
1706 	if (segment.flags & TCP_FLAG_PUSH)
1707 		fReceiveQueue.SetPushPointer();
1708 
1709 	return fReceiveQueue.Available() > 0;
1710 }
1711 
1712 
1713 void
1714 TCPEndpoint::_PrepareReceivePath(tcp_segment_header &segment)
1715 {
1716 	fInitialReceiveSequence = segment.sequence;
1717 
1718 	// count the received SYN
1719 	segment.sequence++;
1720 
1721 	fReceiveNext = segment.sequence;
1722 	fReceiveQueue.SetInitialSequence(segment.sequence);
1723 
1724 	if ((fOptions & TCP_NOOPT) == 0) {
1725 		if (segment.max_segment_size > 0)
1726 			fSendMaxSegmentSize = segment.max_segment_size;
1727 
1728 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1729 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1730 			fSendWindowShift = segment.window_shift;
1731 		} else {
1732 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1733 			fReceiveWindowShift = 0;
1734 		}
1735 
1736 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1737 			fFlags |= FLAG_OPTION_TIMESTAMP;
1738 			fReceivedTimestamp = segment.timestamp_value;
1739 		} else
1740 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1741 	}
1742 
1743 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1744 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1745 }
1746 
1747 
1748 status_t
1749 TCPEndpoint::_PrepareSendPath(const sockaddr *peer)
1750 {
1751 	if (fRoute == NULL) {
1752 		fRoute = gDatalinkModule->get_route(Domain(), peer);
1753 		if (fRoute == NULL)
1754 			return ENETUNREACH;
1755 	}
1756 
1757 	// make sure connection does not already exist
1758 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
1759 		fRoute->interface->address);
1760 	if (status < B_OK)
1761 		return status;
1762 
1763 	fInitialSendSequence = system_time() >> 4;
1764 	fSendNext = fInitialSendSequence;
1765 	fSendUnacknowledged = fInitialSendSequence;
1766 	fSendMax = fInitialSendSequence;
1767 
1768 	// we are counting the SYN here
1769 	fSendQueue.SetInitialSequence(fSendNext + 1);
1770 
1771 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
1772 
1773 	// Compute the window shift we advertise to our peer - if it doesn't support
1774 	// this option, this will be reset to 0 (when its SYN is received)
1775 	fReceiveWindowShift = 0;
1776 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
1777 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
1778 		fReceiveWindowShift++;
1779 	}
1780 
1781 	return B_OK;
1782 }
1783 
1784 
1785 void
1786 TCPEndpoint::_Acknowledged(tcp_segment_header &segment)
1787 {
1788 	size_t previouslyUsed = fSendQueue.Used();
1789 
1790 	fSendQueue.RemoveUntil(segment.acknowledge);
1791 	fSendUnacknowledged = segment.acknowledge;
1792 
1793 	if (fSendNext < fSendUnacknowledged)
1794 		fSendNext = fSendUnacknowledged;
1795 
1796 	if (fSendUnacknowledged == fSendMax)
1797 		gStackModule->cancel_timer(&fRetransmitTimer);
1798 
1799 	if (fSendQueue.Used() < previouslyUsed) {
1800 		// this ACK acknowledged data
1801 
1802 		if (segment.options & TCP_HAS_TIMESTAMPS)
1803 			_UpdateSRTT(tcp_diff_timestamp(segment.timestamp_reply));
1804 		else {
1805 			// TODO: Fallback to RFC 793 type estimation
1806 		}
1807 
1808 		if (is_writable(fState)) {
1809 			// notify threads waiting on the socket to become writable again
1810 			fSendList.Signal();
1811 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
1812 		}
1813 
1814 		if (fCongestionWindow < fSlowStartThreshold)
1815 			fCongestionWindow += fSendMaxSegmentSize;
1816 	}
1817 
1818 	if (fCongestionWindow >= fSlowStartThreshold) {
1819 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
1820 
1821 		if (increment < fCongestionWindow)
1822 			increment = 1;
1823 		else
1824 			increment /= fCongestionWindow;
1825 
1826 		fCongestionWindow += increment;
1827 	}
1828 
1829 	// if there is data left to be send, send it now
1830 	if (fSendQueue.Used() > 0)
1831 		_SendQueued();
1832 }
1833 
1834 
1835 void
1836 TCPEndpoint::_Retransmit()
1837 {
1838 	TRACE("Retransmit()");
1839 	_ResetSlowStart();
1840 	fSendNext = fSendUnacknowledged;
1841 	_SendQueued();
1842 }
1843 
1844 
1845 void
1846 TCPEndpoint::_UpdateSRTT(int32 roundTripTime)
1847 {
1848 	int32 rtt = roundTripTime;
1849 
1850 	// Update_SRTT() as per Van Jacobson
1851 	rtt -= (fRoundTripTime / 8);
1852 	fRoundTripTime += rtt;
1853 	if (rtt < 0)
1854 		rtt = -rtt;
1855 	rtt -= (fRoundTripDeviation / 4);
1856 	fRoundTripDeviation += rtt;
1857 
1858 	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
1859 		* kTimestampFactor;
1860 
1861 	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
1862 		roundTripTime);
1863 }
1864 
1865 
1866 void
1867 TCPEndpoint::_ResetSlowStart()
1868 {
1869 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2,
1870 		2 * fSendMaxSegmentSize);
1871 	fCongestionWindow = fSendMaxSegmentSize;
1872 }
1873 
1874 
1875 void
1876 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1877 {
1878 	if (++fDuplicateAcknowledgeCount < 3)
1879 		return;
1880 
1881 	if (fDuplicateAcknowledgeCount == 3) {
1882 		_ResetSlowStart();
1883 		fCongestionWindow = fSlowStartThreshold + 3
1884 			* fSendMaxSegmentSize;
1885 		fSendNext = segment.acknowledge;
1886 	} else if (fDuplicateAcknowledgeCount > 3)
1887 		fCongestionWindow += fSendMaxSegmentSize;
1888 
1889 	_SendQueued();
1890 }
1891 
1892 
1893 //	#pragma mark - timer
1894 
1895 
1896 /*static*/ void
1897 TCPEndpoint::_RetransmitTimer(net_timer *timer, void *data)
1898 {
1899 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1900 
1901 	MutexLocker locker(endpoint->fLock);
1902 	if (!locker.IsLocked())
1903 		return;
1904 
1905 	endpoint->_Retransmit();
1906 }
1907 
1908 
1909 /*static*/ void
1910 TCPEndpoint::_PersistTimer(net_timer *timer, void *data)
1911 {
1912 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1913 
1914 	MutexLocker locker(endpoint->fLock);
1915 	if (!locker.IsLocked())
1916 		return;
1917 
1918 	endpoint->_SendQueued(true);
1919 }
1920 
1921 
1922 /*static*/ void
1923 TCPEndpoint::_DelayedAcknowledgeTimer(struct net_timer *timer, void *data)
1924 {
1925 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1926 
1927 	MutexLocker locker(endpoint->fLock);
1928 	if (!locker.IsLocked())
1929 		return;
1930 
1931 	endpoint->SendAcknowledge(true);
1932 }
1933 
1934 
1935 /*static*/ void
1936 TCPEndpoint::_TimeWaitTimer(struct net_timer *timer, void *data)
1937 {
1938 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1939 
1940 	gSocketModule->delete_socket(endpoint->socket);
1941 }
1942 
1943